LLVM OpenMP* Runtime Library
kmp_taskq.cpp
1 /*
2  * kmp_taskq.cpp -- TASKQ support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 
21 #define MAX_MESSAGE 512
22 
23 /* Taskq routines and global variables */
24 
25 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
26 
27 #define THREAD_ALLOC_FOR_TASKQ
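/* When THREAD_ALLOC_FOR_TASKQ is defined, taskq storage comes from the
   per-thread allocator (__kmp_thread_malloc / __kmp_thread_free); otherwise
   __kmp_taskq_allocate() and __kmpc_taskq_free() below fall back to
   KMP_INTERNAL_MALLOC / KMP_INTERNAL_FREE. */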
28 
29 static int in_parallel_context(kmp_team_t *team) {
30  return !team->t.t_serialized;
31 }
32 
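/* Ordered-section handshake for tasks in a TASKQ executed in parallel:
   __kmp_taskq_eo() makes the calling task wait until the queue's
   tq_tasknum_serving counter equals its own th_tasknum token, and
   __kmp_taskq_xo() advances tq_tasknum_serving past that token on exit,
   releasing the next task in queuing order.  Both are installed as the
   th_deo_fcn / th_dxo_fcn dispatch hooks by __kmpc_taskq(). */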
33 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
34  int gtid = *gtid_ref;
35  int tid = __kmp_tid_from_gtid(gtid);
36  kmp_uint32 my_token;
37  kmpc_task_queue_t *taskq;
38  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
39 
40  if (__kmp_env_consistency_check)
41 #if KMP_USE_DYNAMIC_LOCK
42  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
43 #else
44  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
45 #endif
46 
47  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
48  KMP_MB(); /* Flush all pending memory write invalidates. */
49 
50  /* GEH - need check here under stats to make sure */
51  /* inside task (curr_thunk[tid] != NULL) */
52 
53  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
54 
55  taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
56 
57  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
58  KMP_MB();
59  }
60 }
61 
62 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
63  int gtid = *gtid_ref;
64  int tid = __kmp_tid_from_gtid(gtid);
65  kmp_uint32 my_token;
66  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
67 
68  if (__kmp_env_consistency_check)
69  __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
70 
71  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
72  KMP_MB(); /* Flush all pending memory write invalidates. */
73 
74  /* GEH - need check here under stats to make sure */
75  /* inside task (curr_thunk[tid] != NULL) */
76 
77  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
78 
79  KMP_MB(); /* Flush all pending memory write invalidates. */
80 
81  tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
82  my_token + 1;
83 
84  KMP_MB(); /* Flush all pending memory write invalidates. */
85  }
86 }
87 
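/* Called (from __kmp_execute_task_from_queue) after an ordered task's body
   has run in a parallel context: if this task's ORDERED turn has not yet
   been consumed, wait for it and then advance tq_tasknum_serving past the
   token, so that a task which never entered its ORDERED section still
   releases its successors. */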
88 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
89  kmp_uint32 my_token;
90  kmpc_task_queue_t *taskq;
91 
92  /* assume we are always called from an active parallel context */
93 
94  KMP_MB(); /* Flush all pending memory write invalidates. */
95 
96  my_token = thunk->th_tasknum;
97 
98  taskq = thunk->th.th_shareds->sv_queue;
99 
100  if (taskq->tq_tasknum_serving <= my_token) {
101  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
102  KMP_MB();
103  taskq->tq_tasknum_serving = my_token + 1;
104  KMP_MB();
105  }
106 }
107 
108 #ifdef KMP_DEBUG
109 
110 static void __kmp_dump_TQF(kmp_int32 flags) {
111  if (flags & TQF_IS_ORDERED)
112  __kmp_printf("ORDERED ");
113  if (flags & TQF_IS_LASTPRIVATE)
114  __kmp_printf("LAST_PRIV ");
115  if (flags & TQF_IS_NOWAIT)
116  __kmp_printf("NOWAIT ");
117  if (flags & TQF_HEURISTICS)
118  __kmp_printf("HEURIST ");
119  if (flags & TQF_INTERFACE_RESERVED1)
120  __kmp_printf("RESERV1 ");
121  if (flags & TQF_INTERFACE_RESERVED2)
122  __kmp_printf("RESERV2 ");
123  if (flags & TQF_INTERFACE_RESERVED3)
124  __kmp_printf("RESERV3 ");
125  if (flags & TQF_INTERFACE_RESERVED4)
126  __kmp_printf("RESERV4 ");
127  if (flags & TQF_IS_LAST_TASK)
128  __kmp_printf("LAST_TASK ");
129  if (flags & TQF_TASKQ_TASK)
130  __kmp_printf("TASKQ_TASK ");
131  if (flags & TQF_RELEASE_WORKERS)
132  __kmp_printf("RELEASE ");
133  if (flags & TQF_ALL_TASKS_QUEUED)
134  __kmp_printf("ALL_QUEUED ");
135  if (flags & TQF_PARALLEL_CONTEXT)
136  __kmp_printf("PARALLEL ");
137  if (flags & TQF_DEALLOCATED)
138  __kmp_printf("DEALLOC ");
139  if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS)))
140  __kmp_printf("(NONE)");
141 }
142 
143 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk,
144  kmp_int32 global_tid) {
145  int i;
146  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
147 
148  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
149 
150  if (thunk != NULL) {
151  for (i = 0; i < nproc; i++) {
152  if (tq->tq_curr_thunk[i] == thunk) {
153  __kmp_printf("[%i] ", i);
154  }
155  }
156  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
157  __kmp_printf("th_task=%p, ", thunk->th_task);
158  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
159  __kmp_printf("th_status=%d, ", thunk->th_status);
160  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
161  __kmp_printf("th_flags=");
162  __kmp_dump_TQF(thunk->th_flags);
163  }
164 
165  __kmp_printf("\n");
166 }
167 
168 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) {
169  kmpc_thunk_t *th;
170 
171  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
172 
173  for (th = thunk; th != NULL; th = th->th_encl_thunk)
174  __kmp_printf("%p ", th);
175 
176  __kmp_printf("\n");
177 }
178 
179 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue,
180  kmp_int32 global_tid) {
181  int qs, count, i;
182  kmpc_thunk_t *thunk;
183  kmpc_task_queue_t *taskq;
184 
185  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
186 
187  if (queue != NULL) {
188  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
189 
190  if (__kmp_env_consistency_check) {
191  __kmp_printf(" tq_loc : ");
192  }
193  if (in_parallel) {
194 
195  // if (queue->tq.tq_parent != 0)
196  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
197 
198  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
199 
200  // Make sure data structures are in consistent state before querying them
201  // Seems to work without this for digital/alpha, needed for IBM/RS6000
202  KMP_MB();
203 
204  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
205  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
206  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
207  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
208  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
209 
210  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
211 
212  // if (queue->tq.tq_parent != 0)
213  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
214 
215  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
216  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
217 
218  // Make sure data structures are in consistent state before querying them
219  // Seems to work without this for digital/alpha, needed for IBM/RS6000
220  KMP_MB();
221  }
222 
223  __kmp_printf(" tq_shareds : ");
224  for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
225  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
226  __kmp_printf("\n");
227 
228  if (in_parallel) {
229  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
230  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
231  }
232 
233  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
234  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
235  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
236 
237  __kmp_printf(" tq_free_thunks : ");
238  for (thunk = queue->tq_free_thunks; thunk != NULL;
239  thunk = thunk->th.th_next_free)
240  __kmp_printf("%p ", thunk);
241  __kmp_printf("\n");
242 
243  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
244  __kmp_printf(" tq_head : %d\n", queue->tq_head);
245  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
246  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
247  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
248  __kmp_printf(" tq_flags : ");
249  __kmp_dump_TQF(queue->tq_flags);
250  __kmp_printf("\n");
251 
252  if (in_parallel) {
253  __kmp_printf(" tq_th_thunks : ");
254  for (i = 0; i < queue->tq_nproc; i++) {
255  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
256  }
257  __kmp_printf("\n");
258  }
259 
260  __kmp_printf("\n");
261  __kmp_printf(" Queue slots:\n");
262 
263  qs = queue->tq_tail;
264  for (count = 0; count < queue->tq_nfull; ++count) {
265  __kmp_printf("(%d)", qs);
266  __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid);
267  qs = (qs + 1) % queue->tq_nslots;
268  }
269 
270  __kmp_printf("\n");
271 
272  if (in_parallel) {
273  if (queue->tq_taskq_slot != NULL) {
274  __kmp_printf(" TaskQ slot:\n");
275  __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot),
276  global_tid);
277  __kmp_printf("\n");
278  }
279  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
280  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
281  }
282  }
283 
284  __kmp_printf(" Taskq freelist: ");
285 
286  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
287 
288  // Make sure data structures are in consistent state before querying them
289  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
290  KMP_MB();
291 
292  for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free)
293  __kmp_printf("%p ", taskq);
294 
295  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
296 
297  __kmp_printf("\n\n");
298 }
299 
300 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq,
301  kmpc_task_queue_t *curr_queue,
302  kmp_int32 level,
303  kmp_int32 global_tid) {
304  int i, count, qs;
305  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
306  kmpc_task_queue_t *queue = curr_queue;
307 
308  if (curr_queue == NULL)
309  return;
310 
311  __kmp_printf(" ");
312 
313  for (i = 0; i < level; i++)
314  __kmp_printf(" ");
315 
316  __kmp_printf("%p", curr_queue);
317 
318  for (i = 0; i < nproc; i++) {
319  if (tq->tq_curr_thunk[i] &&
320  tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) {
321  __kmp_printf(" [%i]", i);
322  }
323  }
324 
325  __kmp_printf(":");
326 
327  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
328 
329  // Make sure data structures are in consistent state before querying them
330  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
331  KMP_MB();
332 
333  qs = curr_queue->tq_tail;
334 
335  for (count = 0; count < curr_queue->tq_nfull; ++count) {
336  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
337  qs = (qs + 1) % curr_queue->tq_nslots;
338  }
339 
340  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
341 
342  __kmp_printf("\n");
343 
344  if (curr_queue->tq_first_child) {
345  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
346 
347  // Make sure data structures are in consistent state before querying them
348  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
349  KMP_MB();
350 
351  if (curr_queue->tq_first_child) {
352  for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
353  queue != NULL; queue = queue->tq_next_child) {
354  __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid);
355  }
356  }
357 
358  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
359  }
360 }
361 
362 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq,
363  kmpc_task_queue_t *tqroot,
364  kmp_int32 global_tid) {
365  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
366 
367  __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid);
368 
369  __kmp_printf("\n");
370 }
371 #endif
372 
373 /* New taskq storage routines that try to minimize overhead of mallocs but
374  still provide cache line alignment. */
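/* Layout produced by __kmp_taskq_allocate(size, gtid):
 *
 *   orig_addr           addr (rounded up to a CACHE_LINE boundary)
 *      |                  |
 *      v                  v
 *      [ ...padding... ] [ void *back-ptr to orig_addr ] [ size bytes for caller ]
 *
 * The caller receives addr + sizeof(void *); __kmpc_taskq_free() reads the
 * word just before that pointer to recover orig_addr for the real free. */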
375 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) {
376  void *addr, *orig_addr;
377  size_t bytes;
378 
379  KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size,
380  global_tid));
381 
382  bytes = sizeof(void *) + CACHE_LINE + size;
383 
384 #ifdef THREAD_ALLOC_FOR_TASKQ
385  orig_addr =
386  (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes);
387 #else
388  KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes));
389  orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes);
390 #endif /* THREAD_ALLOC_FOR_TASKQ */
391 
392  if (orig_addr == 0)
393  KMP_FATAL(OutOfHeapMemory);
394 
395  addr = orig_addr;
396 
397  if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) {
398  KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n"));
399  addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1));
400  }
401 
402  (*(void **)addr) = orig_addr;
403 
404  KB_TRACE(10,
405  ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, "
406  "gtid: %d\n",
407  orig_addr, ((void **)addr) + 1,
408  ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid));
409 
410  return (((void **)addr) + 1);
411 }
412 
413 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) {
414  KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid));
415 
416  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n",
417  (*(((void **)p) - 1)), global_tid));
418 
419 #ifdef THREAD_ALLOC_FOR_TASKQ
420  __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1));
421 #else
422  KMP_INTERNAL_FREE(*(((void **)p) - 1));
423 #endif /* THREAD_ALLOC_FOR_TASKQ */
424 }
425 
426 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since
427  they're of constant size. */
428 
429 static kmpc_task_queue_t *
430 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots,
431  kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc,
432  size_t sizeof_thunk, size_t sizeof_shareds,
433  kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) {
434  kmp_int32 i;
435  size_t bytes;
436  kmpc_task_queue_t *new_queue;
437  kmpc_aligned_shared_vars_t *shared_var_array;
438  char *shared_var_storage;
439  char *pt; /* for doing byte-adjusted address computations */
440 
441  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
442 
443  // Make sure data structures are in consistent state before querying them
444  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
445  KMP_MB();
446 
447  if (tq->tq_freelist) {
448  new_queue = tq->tq_freelist;
449  tq->tq_freelist = tq->tq_freelist->tq.tq_next_free;
450 
451  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
452 
453  new_queue->tq_flags = 0;
454 
455  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
456  } else {
457  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
458 
459  new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate(
460  sizeof(kmpc_task_queue_t), global_tid);
461  new_queue->tq_flags = 0;
462  }
463 
464  /* space in the task queue for queue slots (allocate as one big chunk */
465  /* of storage that also includes space for the taskq-task thunk) */
466 
467  sizeof_thunk +=
468  (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
469  pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid);
470  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
471  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
472 
473  /* chain the allocated thunks into a freelist for this queue */
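/* (thunks 0 .. nthunks-2 go on this freelist; the last slot, nthunks-1, was
   handed back above as *new_taskq_thunk for the taskq task itself and never
   appears on the freelist) */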
474 
475  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
476 
477  for (i = 0; i < (nthunks - 2); i++) {
478  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free =
479  (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk);
480 #ifdef KMP_DEBUG
481  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED;
482 #endif
483  }
484 
485  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL;
486 #ifdef KMP_DEBUG
487  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags =
488  TQF_DEALLOCATED;
489 #endif
490 
491  /* initialize the locks */
492 
493  if (in_parallel) {
494  __kmp_init_lock(&new_queue->tq_link_lck);
495  __kmp_init_lock(&new_queue->tq_free_thunks_lck);
496  __kmp_init_lock(&new_queue->tq_queue_lck);
497  }
498 
499  /* now allocate the slots */
500 
501  bytes = nslots * sizeof(kmpc_aligned_queue_slot_t);
502  new_queue->tq_queue =
503  (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid);
504 
505  /* space for array of pointers to shared variable structures */
506  sizeof_shareds += sizeof(kmpc_task_queue_t *);
507  sizeof_shareds +=
508  (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
509 
510  bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t);
511  shared_var_array =
512  (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid);
513 
514  bytes = nshareds * sizeof_shareds;
515  shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid);
516 
517  for (i = 0; i < nshareds; i++) {
518  shared_var_array[i].ai_data =
519  (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds);
520  shared_var_array[i].ai_data->sv_queue = new_queue;
521  }
522  new_queue->tq_shareds = shared_var_array;
523 
524  /* array for number of outstanding thunks per thread */
525 
526  if (in_parallel) {
527  bytes = nproc * sizeof(kmpc_aligned_int32_t);
528  new_queue->tq_th_thunks =
529  (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid);
530  new_queue->tq_nproc = nproc;
531 
532  for (i = 0; i < nproc; i++)
533  new_queue->tq_th_thunks[i].ai_data = 0;
534  }
535 
536  return new_queue;
537 }
538 
539 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p,
540  int in_parallel, kmp_int32 global_tid) {
541  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
542  __kmpc_taskq_free(p->tq_queue, global_tid);
543 
544  /* free shared var structure storage */
545  __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data),
546  global_tid);
547  /* free array of pointers to shared vars storage */
548  __kmpc_taskq_free(p->tq_shareds, global_tid);
549 
550 #ifdef KMP_DEBUG
551  p->tq_first_child = NULL;
552  p->tq_next_child = NULL;
553  p->tq_prev_child = NULL;
554  p->tq_ref_count = -10;
555  p->tq_shareds = NULL;
556  p->tq_tasknum_queuing = 0;
557  p->tq_tasknum_serving = 0;
558  p->tq_queue = NULL;
559  p->tq_thunk_space = NULL;
560  p->tq_taskq_slot = NULL;
561  p->tq_free_thunks = NULL;
562  p->tq_nslots = 0;
563  p->tq_head = 0;
564  p->tq_tail = 0;
565  p->tq_nfull = 0;
566  p->tq_hiwat = 0;
567 
568  if (in_parallel) {
569  int i;
570 
571  for (i = 0; i < p->tq_nproc; i++)
572  p->tq_th_thunks[i].ai_data = 0;
573  }
574  if (__kmp_env_consistency_check)
575  p->tq_loc = NULL;
576  KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED);
577  p->tq_flags = TQF_DEALLOCATED;
578 #endif /* KMP_DEBUG */
579 
580  if (in_parallel) {
581  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
582  __kmp_destroy_lock(&p->tq_link_lck);
583  __kmp_destroy_lock(&p->tq_queue_lck);
584  __kmp_destroy_lock(&p->tq_free_thunks_lck);
585  }
586 #ifdef KMP_DEBUG
587  p->tq_th_thunks = NULL;
588 #endif /* KMP_DEBUG */
589 
590  // Make sure data structures are in consistent state before querying them
591  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
592  KMP_MB();
593 
594  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
595  p->tq.tq_next_free = tq->tq_freelist;
596 
597  tq->tq_freelist = p;
598  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
599 }
600 
601 /* Once a group of thunks has been allocated for use in a particular queue,
602  these are managed via a per-queue freelist.
603  We force a check that there's always a thunk free if we need one. */
604 
605 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue,
606  int in_parallel, kmp_int32 global_tid) {
607  kmpc_thunk_t *fl;
608 
609  if (in_parallel) {
610  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
611  // Make sure data structures are in consistent state before querying them
612  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
613  KMP_MB();
614  }
615 
616  fl = queue->tq_free_thunks;
617 
618  KMP_DEBUG_ASSERT(fl != NULL);
619 
620  queue->tq_free_thunks = fl->th.th_next_free;
621  fl->th_flags = 0;
622 
623  if (in_parallel)
624  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
625 
626  return fl;
627 }
628 
629 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p,
630  int in_parallel, kmp_int32 global_tid) {
631 #ifdef KMP_DEBUG
632  p->th_task = 0;
633  p->th_encl_thunk = 0;
634  p->th_status = 0;
635  p->th_tasknum = 0;
636 /* Also could zero pointers to private vars */
637 #endif
638 
639  if (in_parallel) {
640  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
641  // Make sure data structures are in consistent state before querying them
642  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
643  KMP_MB();
644  }
645 
646  p->th.th_next_free = queue->tq_free_thunks;
647  queue->tq_free_thunks = p;
648 
649 #ifdef KMP_DEBUG
650  p->th_flags = TQF_DEALLOCATED;
651 #endif
652 
653  if (in_parallel)
654  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
655 }
656 
657 /* returns nonzero if the queue just became full after the enqueue */
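/* The return value lets the enqueuing thread react when the queue fills up
   (e.g., by switching from enqueuing to executing work itself); in the
   serialized case FALSE is always returned. */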
658 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid,
659  kmpc_task_queue_t *queue,
660  kmpc_thunk_t *thunk, int in_parallel) {
661  kmp_int32 ret;
662 
663  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the
664  * master is executing then) */
665  if (in_parallel) {
666  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
667  // Make sure data structures are in consistent state before querying them
668  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
669  KMP_MB();
670  }
671 
672  KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full
673 
674  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
675 
676  if (queue->tq_head >= queue->tq_nslots)
677  queue->tq_head = 0;
678 
679  (queue->tq_nfull)++;
680 
681  KMP_MB(); /* to assure that nfull is seen to increase before
682  TQF_ALL_TASKS_QUEUED is set */
683 
684  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
685 
686  if (in_parallel) {
687  /* don't need to wait until workers are released before unlocking */
688  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
689 
690  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
691  // If just creating the root queue, the worker threads are waiting at a
692  // join barrier until now, when there's something in the queue for them to
693  // do; release them now to do work. This should only be done when this is
694  // the first task enqueued, so reset the flag here also.
695  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers
696  are still in spin mode */
697  // avoid releasing barrier twice if taskq_task switches threads
698  KMP_MB();
699 
700  __kmpc_end_barrier_master(NULL, global_tid);
701  }
702  }
703 
704  return ret;
705 }
706 
707 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid,
708  kmpc_task_queue_t *queue,
709  int in_parallel) {
710  kmpc_thunk_t *pt;
711  int tid = __kmp_tid_from_gtid(global_tid);
712 
713  KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */
714 
715  if (queue->tq.tq_parent != NULL && in_parallel) {
716  int ct;
717  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
718  ct = ++(queue->tq_ref_count);
719  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
720  KMP_DEBUG_REF_CTS(
721  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
722  }
723 
724  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
725 
726  if (queue->tq_tail >= queue->tq_nslots)
727  queue->tq_tail = 0;
728 
729  if (in_parallel) {
730  queue->tq_th_thunks[tid].ai_data++;
731 
732  KMP_MB(); /* necessary so ai_data increment is propagated to other threads
733  immediately (digital) */
734 
735  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding "
736  "thunks from queue %p\n",
737  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
738  }
739 
740  (queue->tq_nfull)--;
741 
742 #ifdef KMP_DEBUG
743  KMP_MB();
744 
745  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is
746  * decremented */
747 
748  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
749 
750  if (in_parallel) {
751  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <=
752  __KMP_TASKQ_THUNKS_PER_TH);
753  }
754 #endif
755 
756  return pt;
757 }
758 
759 /* Find the next (non-null) task to dequeue and return it.
760  * This is never called unless in_parallel=TRUE
761  *
762  * Here are the rules for deciding which queue to take the task from:
763  * 1. Walk up the task queue tree from the current queue's parent and look
764  * on the way up (for loop, below).
765  * 2. Do a depth-first search back down the tree from the root and
766  * look (find_task_in_descendant_queue()).
767  *
768  * Here are the rules for deciding which task to take from a queue
769  * (__kmp_find_task_in_queue ()):
770  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
771  * must be staged to make sure we execute the last one with
772  * TQF_IS_LAST_TASK at the end of task queue execution.
773  * 2. If the queue length is below some high water mark and the taskq task
774  * is enqueued, prefer running the taskq task.
775  * 3. Otherwise, take a (normal) task from the queue.
776  *
777  * If we do all this and return pt == NULL at the bottom of this routine,
778  * this means there are no more tasks to execute (except possibly for
779  * TQF_IS_LASTPRIVATE).
780  */
781 
782 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid,
783  kmpc_task_queue_t *queue) {
784  kmpc_thunk_t *pt = NULL;
785  int tid = __kmp_tid_from_gtid(global_tid);
786 
787  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
788  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
789 
790  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
791 
792  /* Check again to avoid race in __kmpc_end_taskq() */
793  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
794  // Make sure data structures are in consistent state before querying them
795  // Seems to work without this for digital/alpha, needed for IBM/RS6000
796  KMP_MB();
797 
798  if ((queue->tq_taskq_slot != NULL) &&
799  (queue->tq_nfull <= queue->tq_hiwat)) {
800  /* if there's enough room in the queue and the dispatcher */
801  /* (taskq task) is available, schedule more tasks */
802  pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot);
803  queue->tq_taskq_slot = NULL;
804  } else if (queue->tq_nfull == 0 ||
805  queue->tq_th_thunks[tid].ai_data >=
806  __KMP_TASKQ_THUNKS_PER_TH) {
807  /* do nothing if no thunks available or this thread can't */
808  /* run any because it already is executing too many */
809  pt = NULL;
810  } else if (queue->tq_nfull > 1) {
811  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
812 
813  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
814  } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
815  // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE
816  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
817  } else if (queue->tq_flags & TQF_IS_LAST_TASK) {
818  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
819  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
820  /* instrumentation does copy-out. */
821  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
822  pt->th_flags |=
823  TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
824  }
825  }
826 
827  /* GEH - What happens here if is lastprivate, but not last task? */
828  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
829  }
830 
831  return pt;
832 }
833 
834 /* Walk a tree of queues starting at queue's first child and return a non-NULL
835  thunk if one can be scheduled. Must only be called when in_parallel=TRUE */
836 
837 static kmpc_thunk_t *
838 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid,
839  kmpc_task_queue_t *curr_queue) {
840  kmpc_thunk_t *pt = NULL;
841  kmpc_task_queue_t *queue = curr_queue;
842 
843  if (curr_queue->tq_first_child != NULL) {
844  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
845  // Make sure data structures are in consistent state before querying them
846  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
847  KMP_MB();
848 
849  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
850  if (queue == NULL) {
851  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
852  return NULL;
853  }
854 
855  while (queue != NULL) {
856  int ct;
857  kmpc_task_queue_t *next;
858 
859  ct = ++(queue->tq_ref_count);
860  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
861  KMP_DEBUG_REF_CTS(
862  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
863 
864  pt = __kmp_find_task_in_queue(global_tid, queue);
865 
866  if (pt != NULL) {
867  int ct;
868 
869  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
870  // Make sure data structures in consistent state before querying them
871  // Seems to work without this for digital/alpha, needed for IBM/RS6000
872  KMP_MB();
873 
874  ct = --(queue->tq_ref_count);
875  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
876  global_tid, queue, ct));
877  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
878 
879  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
880 
881  return pt;
882  }
883 
884  /* although reference count stays active during descendant walk, shouldn't
885  matter since if children still exist, reference counts aren't being
886  monitored anyway */
887 
888  pt = __kmp_find_task_in_descendant_queue(global_tid, queue);
889 
890  if (pt != NULL) {
891  int ct;
892 
893  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
894  // Make sure data structures in consistent state before querying them
895  // Seems to work without this for digital/alpha, needed for IBM/RS6000
896  KMP_MB();
897 
898  ct = --(queue->tq_ref_count);
899  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
900  global_tid, queue, ct));
901  KMP_DEBUG_ASSERT(ct >= 0);
902 
903  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
904 
905  return pt;
906  }
907 
908  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
909  // Make sure data structures in consistent state before querying them
910  // Seems to work without this for digital/alpha, needed for IBM/RS6000
911  KMP_MB();
912 
913  next = queue->tq_next_child;
914 
915  ct = --(queue->tq_ref_count);
916  KMP_DEBUG_REF_CTS(
917  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
918  KMP_DEBUG_ASSERT(ct >= 0);
919 
920  queue = next;
921  }
922 
923  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
924  }
925 
926  return pt;
927 }
928 
929 /* Walk up the taskq tree looking for a task to execute. If we get to the root,
930  search the tree for a descendant queue task. Must only be called when
931  in_parallel=TRUE */
932 static kmpc_thunk_t *
933 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid,
934  kmpc_task_queue_t *curr_queue) {
935  kmpc_task_queue_t *queue;
936  kmpc_thunk_t *pt;
937 
938  pt = NULL;
939 
940  if (curr_queue->tq.tq_parent != NULL) {
941  queue = curr_queue->tq.tq_parent;
942 
943  while (queue != NULL) {
944  if (queue->tq.tq_parent != NULL) {
945  int ct;
946  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
947  // Make sure data structures in consistent state before querying them
948  // Seems to work without this for digital/alpha, needed for IBM/RS6000
949  KMP_MB();
950 
951  ct = ++(queue->tq_ref_count);
952  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
953  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__,
954  global_tid, queue, ct));
955  }
956 
957  pt = __kmp_find_task_in_queue(global_tid, queue);
958  if (pt != NULL) {
959  if (queue->tq.tq_parent != NULL) {
960  int ct;
961  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
962  // Make sure data structures in consistent state before querying them
963  // Seems to work without this for digital/alpha, needed for IBM/RS6000
964  KMP_MB();
965 
966  ct = --(queue->tq_ref_count);
967  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
968  global_tid, queue, ct));
969  KMP_DEBUG_ASSERT(ct >= 0);
970 
971  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
972  }
973 
974  return pt;
975  }
976 
977  if (queue->tq.tq_parent != NULL) {
978  int ct;
979  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
980  // Make sure data structures in consistent state before querying them
981  // Seems to work without this for digital/alpha, needed for IBM/RS6000
982  KMP_MB();
983 
984  ct = --(queue->tq_ref_count);
985  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
986  global_tid, queue, ct));
987  KMP_DEBUG_ASSERT(ct >= 0);
988  }
989  queue = queue->tq.tq_parent;
990 
991  if (queue != NULL)
992  __kmp_release_lock(&queue->tq_link_lck, global_tid);
993  }
994  }
995 
996  pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root);
997 
998  return pt;
999 }
1000 
1001 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) {
1002  int i;
1003 
1004  /* KMP_MB(); */ /* is this really necessary? */
1005 
1006  for (i = 0; i < queue->tq_nproc; i++) {
1007  if (queue->tq_th_thunks[i].ai_data != 0)
1008  return FALSE;
1009  }
1010 
1011  return TRUE;
1012 }
1013 
1014 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) {
1015  return (queue->tq_first_child != NULL);
1016 }
1017 
1018 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
1019  kmpc_task_queue_t *queue,
1020  int in_parallel) {
1021 #ifdef KMP_DEBUG
1022  kmp_int32 i;
1023  kmpc_thunk_t *thunk;
1024 #endif
1025 
1026  KF_TRACE(50,
1027  ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1028  KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid));
1029 
1030  /* sub-queue in a recursion, not the root task queue */
1031  KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL);
1032 
1033  if (in_parallel) {
1034  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1035  // Make sure data structures are in consistent state before querying them
1036  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1037  KMP_MB();
1038  }
1039 
1040  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1041 
1042  /* unlink queue from its siblings if any at this level */
1043  if (queue->tq_prev_child != NULL)
1044  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1045  if (queue->tq_next_child != NULL)
1046  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1047  if (queue->tq.tq_parent->tq_first_child == queue)
1048  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1049 
1050  queue->tq_prev_child = NULL;
1051  queue->tq_next_child = NULL;
1052 
1053  if (in_parallel) {
1054  KMP_DEBUG_REF_CTS(
1055  ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1056  __LINE__, global_tid, queue, queue->tq_ref_count));
1057 
1058  /* wait until all other threads have stopped accessing this queue */
1059  while (queue->tq_ref_count > 1) {
1060  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1061 
1062  KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
1063  NULL);
1064 
1065  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1066  // Make sure data structures are in consistent state before querying them
1067  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1068  KMP_MB();
1069  }
1070 
1071  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1072  }
1073 
1074  KMP_DEBUG_REF_CTS(
1075  ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue));
1076 
1077 #ifdef KMP_DEBUG
1078  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1079  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1080 
1081  for (i = 0; i < queue->tq_nproc; i++) {
1082  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1083  }
1084 
1085  i = 0;
1086  for (thunk = queue->tq_free_thunks; thunk != NULL;
1087  thunk = thunk->th.th_next_free)
1088  ++i;
1089 
1090  KMP_ASSERT(i ==
1091  queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1092 #endif
1093 
1094  /* release storage for queue entry */
1095  __kmp_free_taskq(tq, queue, TRUE, global_tid);
1096 
1097  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1098  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1099 }
1100 
1101 /* Starting from indicated queue, proceed downward through tree and remove all
1102  taskqs which are finished, but only go down to taskqs which have the "nowait"
1103  clause present. Assume this is only called when in_parallel=TRUE. */
1104 
1105 static void __kmp_find_and_remove_finished_child_taskq(
1106  kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) {
1107  kmpc_task_queue_t *queue = curr_queue;
1108 
1109  if (curr_queue->tq_first_child != NULL) {
1110  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1111  // Make sure data structures are in consistent state before querying them
1112  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1113  KMP_MB();
1114 
1115  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
1116  if (queue != NULL) {
1117  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1118  return;
1119  }
1120 
1121  while (queue != NULL) {
1122  kmpc_task_queue_t *next;
1123  int ct = ++(queue->tq_ref_count);
1124  KMP_DEBUG_REF_CTS(
1125  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
1126 
1127  /* although reference count stays active during descendant walk, */
1128  /* shouldn't matter since if children still exist, reference */
1129  /* counts aren't being monitored anyway */
1130 
1131  if (queue->tq_flags & TQF_IS_NOWAIT) {
1132  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1133 
1134  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) &&
1135  (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) &&
1136  !__kmp_taskq_has_any_children(queue)) {
1137 
1138  /* Only remove this if we have not already marked it for deallocation.
1139  This should prevent multiple threads from trying to free this. */
1140 
1141  if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) {
1142  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1143  queue->tq_flags |= TQF_DEALLOCATED;
1144  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1145 
1146  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1147 
1148  /* Can't do any more here, since we can't be sure where the sibling
1149  * queue is, so just exit this level */
1150  return;
1151  } else {
1152  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1153  }
1154  }
1155  /* otherwise, just fall through and decrement reference count */
1156  }
1157  }
1158 
1159  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1160  // Make sure data structures are in consistent state before querying them
1161  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1162  KMP_MB();
1163 
1164  next = queue->tq_next_child;
1165 
1166  ct = --(queue->tq_ref_count);
1167  KMP_DEBUG_REF_CTS(
1168  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1169  KMP_DEBUG_ASSERT(ct >= 0);
1170 
1171  queue = next;
1172  }
1173 
1174  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1175  }
1176 }
1177 
1178 /* Starting from indicated queue, proceed downward through tree and remove all
1179  taskqs, assuming all are finished and that NO other threads are executing
1180  at this point. */
1181 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid,
1182  kmpc_task_queue_t *queue) {
1183  kmpc_task_queue_t *next_child;
1184 
1185  queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child);
1186 
1187  while (queue != NULL) {
1188  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1189 
1190  next_child = queue->tq_next_child;
1191  queue->tq_flags |= TQF_DEALLOCATED;
1192  __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE);
1193  queue = next_child;
1194  }
1195 }
1196 
1197 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc,
1198  kmp_int32 global_tid,
1199  kmpc_thunk_t *thunk,
1200  int in_parallel) {
1201  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1202  kmp_int32 tid = __kmp_tid_from_gtid(global_tid);
1203 
1204  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1205  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1206  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1207  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1208 
1209  /* For the taskq task, the curr_thunk push and pop pairs are set up as
1210  * follows:
1211  *
1212  * happens exactly once:
1213  * 1) __kmpc_taskq : push (if returning thunk only)
1214  * 4) __kmpc_end_taskq_task : pop
1215  *
1216  * optionally happens *each* time taskq task is dequeued/enqueued:
1217  * 2) __kmpc_taskq_task : pop
1218  * 3) __kmp_execute_task_from_queue : push
1219  *
1220  * execution ordering: 1,(2,3)*,4
1221  */
1222 
1223  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1224  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1225  thunk->th.th_shareds =
1226  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data);
1227 
1228  if (__kmp_env_consistency_check) {
1229  __kmp_push_workshare(global_tid,
1230  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1231  : ct_task,
1232  queue->tq_loc);
1233  }
1234  } else {
1235  if (__kmp_env_consistency_check)
1236  __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc);
1237  }
1238 
1239  if (in_parallel) {
1240  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1241  tq->tq_curr_thunk[tid] = thunk;
1242 
1243  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1244  }
1245 
1246  KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk,
1247  queue, global_tid));
1248  thunk->th_task(global_tid, thunk);
1249  KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue,
1250  global_tid));
1251 
1252  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1253  if (__kmp_env_consistency_check)
1254  __kmp_pop_workshare(global_tid,
1255  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1256  : ct_task,
1257  queue->tq_loc);
1258 
1259  if (in_parallel) {
1260  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1261  thunk->th_encl_thunk = NULL;
1262  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1263  }
1264 
1265  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1266  __kmp_taskq_check_ordered(global_tid, thunk);
1267  }
1268 
1269  __kmp_free_thunk(queue, thunk, in_parallel, global_tid);
1270 
1271  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n",
1272  global_tid, thunk));
1273  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1274 
1275  if (in_parallel) {
1276  KMP_MB(); /* needed so thunk put on free list before outstanding thunk
1277  count is decremented */
1278 
1279  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1280 
1281  KF_TRACE(
1282  200,
1283  ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1284  global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue));
1285 
1286  queue->tq_th_thunks[tid].ai_data--;
1287 
1288  /* KMP_MB(); */ /* is MB really necessary ? */
1289  }
1290 
1291  if (queue->tq.tq_parent != NULL && in_parallel) {
1292  int ct;
1293  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1294  ct = --(queue->tq_ref_count);
1295  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1296  KMP_DEBUG_REF_CTS(
1297  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1298  KMP_DEBUG_ASSERT(ct >= 0);
1299  }
1300  }
1301 }
1302 
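/* Rough sketch, for orientation only, of the call sequence a compiler emits
 * for an Intel TASKQ (workqueuing) construct; the exact signatures and the
 * remaining entry points (__kmpc_task_buffer, __kmpc_task, __kmpc_taskq_task,
 * __kmpc_end_taskq_task) are declared in kmp.h and defined later in this
 * file:
 *
 *   taskq_thunk = __kmpc_taskq(loc, gtid, taskq_task, ...);   // create queue
 *   if (taskq_thunk != NULL) {
 *     // For the outermost queue, only the thread that created it gets a
 *     // non-NULL thunk back; it drives the taskq task, which enqueues work
 *     // via __kmpc_task_buffer()/__kmpc_task() and calls
 *     // __kmpc_end_taskq_task() once every task has been enqueued.
 *   }
 *   __kmpc_end_taskq(loc, gtid, taskq_thunk);  // all threads drain the queue
 */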
1303 /* starts a taskq; creates and returns a thunk for the taskq_task */
1304 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1305 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
1306  kmpc_task_t taskq_task, size_t sizeof_thunk,
1307  size_t sizeof_shareds, kmp_int32 flags,
1308  kmpc_shared_vars_t **shareds) {
1309  int in_parallel;
1310  kmp_int32 nslots, nthunks, nshareds, nproc;
1311  kmpc_task_queue_t *new_queue, *curr_queue;
1312  kmpc_thunk_t *new_taskq_thunk;
1313  kmp_info_t *th;
1314  kmp_team_t *team;
1315  kmp_taskq_t *tq;
1316  kmp_int32 tid;
1317 
1318  KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid));
1319 
1320  th = __kmp_threads[global_tid];
1321  team = th->th.th_team;
1322  tq = &team->t.t_taskq;
1323  nproc = team->t.t_nproc;
1324  tid = __kmp_tid_from_gtid(global_tid);
1325 
1326  /* find out whether this is a parallel taskq or serialized one. */
1327  in_parallel = in_parallel_context(team);
1328 
1329  if (!tq->tq_root) {
1330  if (in_parallel) {
1331  /* Vector ORDERED SECTION to taskq version */
1332  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1333 
1334  /* Vector ORDERED SECTION to taskq version */
1335  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1336  }
1337 
1338  if (in_parallel) {
1339  // This shouldn't be a barrier region boundary; it would confuse the user.
1340  /* Need the boundary to be at the end of the taskq instead. */
1341  if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1342  /* Creating the active root queue, and we are not the master thread. */
1343  /* The master thread below created the queue and tasks have been */
1344  /* enqueued, and the master thread released this barrier. This */
1345  /* worker thread can now proceed and execute tasks. See also the */
1346  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1347  *shareds =
1348  CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data);
1349  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1350 
1351  return NULL;
1352  }
1353  }
1354 
1355  /* master thread only executes this code */
1356  if (tq->tq_curr_thunk_capacity < nproc) {
1357  if (tq->tq_curr_thunk)
1358  __kmp_free(tq->tq_curr_thunk);
1359  else {
1360  /* only need to do this once at outer level, i.e. when tq_curr_thunk is
1361  * still NULL */
1362  __kmp_init_lock(&tq->tq_freelist_lck);
1363  }
1364 
1365  tq->tq_curr_thunk =
1366  (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *));
1367  tq->tq_curr_thunk_capacity = nproc;
1368  }
1369 
1370  if (in_parallel)
1371  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1372  }
1373 
1374  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1375  /* on some heuristics (e.g., depth of queue nesting?). */
1376  nslots = (in_parallel) ? (2 * nproc) : 1;
1377 
1378  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1379  /* jobs being executed by other threads, and one extra for taskq slot */
1380  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1)
1381  : nslots + 2;
1382 
1383  /* Only the root taskq gets a per-thread array of shareds. */
1384  /* The rest of the taskqs only get one copy of the shared vars. */
1385  nshareds = (!tq->tq_root && in_parallel) ? nproc : 1;
1386 
1387  /* create overall queue data structure and its components that require
1388  * allocation */
1389  new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
1390  nproc, sizeof_thunk, sizeof_shareds,
1391  &new_taskq_thunk, global_tid);
1392 
1393  /* rest of new_queue initializations */
1394  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1395 
1396  if (in_parallel) {
1397  new_queue->tq_tasknum_queuing = 0;
1398  new_queue->tq_tasknum_serving = 0;
1399  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1400  }
1401 
1402  new_queue->tq_taskq_slot = NULL;
1403  new_queue->tq_nslots = nslots;
1404  new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
1405  new_queue->tq_nfull = 0;
1406  new_queue->tq_head = 0;
1407  new_queue->tq_tail = 0;
1408  new_queue->tq_loc = loc;
1409 
1410  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1411  /* prepare to serve the first-queued task's ORDERED directive */
1412  new_queue->tq_tasknum_serving = 1;
1413 
1414  /* Vector ORDERED SECTION to taskq version */
1415  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1416 
1417  /* Vector ORDERED SECTION to taskq version */
1418  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1419  }
1420 
1421  /* create a new thunk for the taskq_task in the new_queue */
1422  *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);
1423 
1424  new_taskq_thunk->th.th_shareds = *shareds;
1425  new_taskq_thunk->th_task = taskq_task;
1426  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1427  new_taskq_thunk->th_status = 0;
1428 
1429  KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1430 
1431  // Make sure these inits complete before threads start using this queue
1432  /* KMP_MB(); */ // (necessary?)
1433 
1434  /* insert the new task queue into the tree, but only after all fields
1435  * initialized */
1436 
1437  if (in_parallel) {
1438  if (!tq->tq_root) {
1439  new_queue->tq.tq_parent = NULL;
1440  new_queue->tq_first_child = NULL;
1441  new_queue->tq_next_child = NULL;
1442  new_queue->tq_prev_child = NULL;
1443  new_queue->tq_ref_count = 1;
1444  tq->tq_root = new_queue;
1445  } else {
1446  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1447  new_queue->tq.tq_parent = curr_queue;
1448  new_queue->tq_first_child = NULL;
1449  new_queue->tq_prev_child = NULL;
1450  new_queue->tq_ref_count =
1451  1; /* for the thread that built the queue */
1452 
1453  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
1454  global_tid, new_queue, new_queue->tq_ref_count));
1455 
1456  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1457 
1458  // Make sure data structures are in consistent state before querying them
1459  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1460  KMP_MB();
1461 
1462  new_queue->tq_next_child =
1463  CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);
1464 
1465  if (curr_queue->tq_first_child != NULL)
1466  curr_queue->tq_first_child->tq_prev_child = new_queue;
1467 
1468  curr_queue->tq_first_child = new_queue;
1469 
1470  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1471  }
1472 
1473  /* set up thunk stack only after code that determines curr_queue above */
1474  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1475  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1476 
1477  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1478  } else {
1479  new_taskq_thunk->th_encl_thunk = 0;
1480  new_queue->tq.tq_parent = NULL;
1481  new_queue->tq_first_child = NULL;
1482  new_queue->tq_next_child = NULL;
1483  new_queue->tq_prev_child = NULL;
1484  new_queue->tq_ref_count = 1;
1485  }
1486 
1487 #ifdef KMP_DEBUG
1488  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1489  KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));
1490 
1491  if (in_parallel) {
1492  KF_TRACE(25,
1493  ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1494  } else {
1495  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
1496  global_tid));
1497  }
1498 
1499  KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));
1500 
1501  if (in_parallel) {
1502  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1503  }
1504 #endif /* KMP_DEBUG */
1505 
1506  if (__kmp_env_consistency_check)
1507  __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);
1508 
1509  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1510 
1511  return new_taskq_thunk;
1512 }
1513 
1514 /* ends a taskq; last thread out destroys the queue */
1515 
1516 void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
1517  kmpc_thunk_t *taskq_thunk) {
1518 #ifdef KMP_DEBUG
1519  kmp_int32 i;
1520 #endif
1521  kmp_taskq_t *tq;
1522  int in_parallel;
1523  kmp_info_t *th;
1524  kmp_int32 is_outermost;
1525  kmpc_task_queue_t *queue;
1526  kmpc_thunk_t *thunk;
1527  int nproc;
1528 
1529  KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1530 
1531  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1532  nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
1533 
1534  /* For the outermost taskq only, all but one thread will have taskq_thunk ==
1535  * NULL */
1536  queue = (taskq_thunk == NULL) ? tq->tq_root
1537  : taskq_thunk->th.th_shareds->sv_queue;
1538 
1539  KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1540  is_outermost = (queue == tq->tq_root);
1541  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1542 
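 /* The parallel-context epilogue below proceeds roughly in phases:
    (1) drain this queue (and steal from descendants) until all tasks have
        been queued and dequeued;
    (2) keep stealing from descendant queues until this queue's outstanding
        thunks have finished;
    (3) for a non-outermost queue, either return early (NOWAIT) or wait for
        children to finish, remove them, and let the allocating thread
        unlink this queue from the tree;
    (4) for the outermost queue, steal until everything is finished, hit the
        barrier, and let the master tear down the remaining queues. */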
1543  if (in_parallel) {
1544  kmp_uint32 spins;
1545 
1546  /* this is just a safeguard to release the waiting threads if */
1547  /* the outermost taskq never queues a task */
1548 
1549  if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
1550  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
1551  /* no lock needed, workers are still in spin mode */
1552  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1553 
1554  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1555  }
1556  }
1557 
1558  /* keep dequeueing work until all tasks are queued and dequeued */
1559 
1560  do {
1561  /* wait until something is available to dequeue */
1562  KMP_INIT_YIELD(spins);
1563 
1564  while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
1565  (!__kmp_taskq_has_any_children(queue)) &&
1566  (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
1567  KMP_YIELD_WHEN(TRUE, spins);
1568  }
1569 
1570  /* check to see if we can execute tasks in the queue */
1571  while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
1572  (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
1573  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
1574  queue, global_tid));
1575  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1576  }
1577 
1578  /* see if work can be found in a descendant queue */
1579  if ((__kmp_taskq_has_any_children(queue)) &&
1580  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1581  NULL) {
1582 
1583  KF_TRACE(50,
1584  ("Stole thunk: %p in descendant queue: %p while waiting in "
1585  "queue: %p (%d)\n",
1586  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1587 
1588  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1589  }
1590 
1591  } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
1592  (queue->tq_nfull != 0));
1593 
1594  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
1595  global_tid));
1596 
1597  /* keep stealing and executing while not all tasks are finished and more
1598  work can be found in descendant queues */
1599 
1600  while ((!__kmp_taskq_tasks_finished(queue)) &&
1601  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1602  NULL) {
1603 
1604  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
1605  "queue: %p (%d)\n",
1606  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1607 
1608  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1609  }
1610 
1611  KF_TRACE(50, ("No work found in descendant queues or all work finished in "
1612  "queue: %p (%d)\n",
1613  queue, global_tid));
1614 
1615  if (!is_outermost) {
1616  /* need to return if NOWAIT present and not outermost taskq */
1617 
1618  if (queue->tq_flags & TQF_IS_NOWAIT) {
1619  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1620  queue->tq_ref_count--;
1621  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
1622  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1623 
1624  KE_TRACE(
1625  10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1626 
1627  return;
1628  }
1629 
1630  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1631 
1632  /* WAIT until all tasks are finished and no child queues exist before
1633  * proceeding */
1634  KMP_INIT_YIELD(spins);
1635 
1636  while (!__kmp_taskq_tasks_finished(queue) ||
1637  __kmp_taskq_has_any_children(queue)) {
1638  thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);
1639 
1640  if (thunk != NULL) {
1641  KF_TRACE(50,
1642  ("Stole thunk: %p in ancestor queue: %p while waiting in "
1643  "queue: %p (%d)\n",
1644  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1645  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
1646  in_parallel);
1647  }
1648 
1649  KMP_YIELD_WHEN(thunk == NULL, spins);
1650 
1651  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1652  }
1653 
1654  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1655  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1656  queue->tq_flags |= TQF_DEALLOCATED;
1657  }
1658  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1659 
1660  /* only the allocating thread can deallocate the queue */
1661  if (taskq_thunk != NULL) {
1662  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1663  }
1664 
1665  KE_TRACE(
1666  10,
1667  ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
1668  global_tid));
1669 
1670  return;
1671  }
1672 
1673  // Outermost Queue: steal work from descendants until all tasks are finished
1674 
1675  KMP_INIT_YIELD(spins);
1676 
1677  while (!__kmp_taskq_tasks_finished(queue)) {
1678  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1679 
1680  if (thunk != NULL) {
1681  KF_TRACE(50,
1682  ("Stole thunk: %p in descendant queue: %p while waiting in "
1683  "queue: %p (%d)\n",
1684  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1685 
1686  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1687  }
1688 
1689  KMP_YIELD_WHEN(thunk == NULL, spins);
1690  }
1691 
1692  /* Need this barrier to prevent destruction of queue before threads have all
1693  * executed above code */
1694  /* This may need to be done earlier when NOWAIT is implemented for the
1695  * outermost level */
1696 
1697  if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1698  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1699  /* for right now, everybody waits, and the master thread destroys the */
1700  /* remaining queues. */
1701 
1702  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1703 
1704  /* Now destroy the root queue */
1705  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
1706  global_tid, queue));
1707  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1708 
1709 #ifdef KMP_DEBUG
1710  /* the root queue entry */
1711  KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
1712  (queue->tq_next_child == NULL));
1713 
1714  /* children must all be gone by now because of barrier above */
1715  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1716 
1717  for (i = 0; i < nproc; i++) {
1718  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1719  }
1720 
1721  for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
1722  i++, thunk = thunk->th.th_next_free)
1723  ;
1724 
1725  KMP_DEBUG_ASSERT(i ==
1726  queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1727 
1728  for (i = 0; i < nproc; i++) {
1729  KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
1730  }
1731 #endif
1732  /* unlink the root queue entry */
1733  tq->tq_root = NULL;
1734 
1735  /* release storage for root queue entry */
1736  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
1737  global_tid));
1738 
1739  queue->tq_flags |= TQF_DEALLOCATED;
1740  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1741 
1742  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1743 
1744  /* release the workers now that the data structures are up to date */
1745  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1746  }
1747 
1748  th = __kmp_threads[global_tid];
1749 
1750  /* Reset ORDERED SECTION entry hook to the parallel version */
1751  th->th.th_dispatch->th_deo_fcn = 0;
1752 
1753  /* Reset ORDERED SECTION exit hook to the parallel version */
1754  th->th.th_dispatch->th_dxo_fcn = 0;
1755  } else {
1756  /* in serial execution context, dequeue the last task */
1757  /* and execute it, if there were any tasks encountered */
1758 
1759  if (queue->tq_nfull > 0) {
1760  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1761 
1762  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1763 
1764  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1765  /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */
1766  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
1767  /* instrumentation does copy-out. */
1768 
1769  /* no need for test_then_or call since already locked */
1770  thunk->th_flags |= TQF_IS_LAST_TASK;
1771  }
1772 
1773  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1774  thunk, queue));
1775 
1776  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1777  }
1778 
1779  // destroy the unattached serial queue now that there is no more work to do
1780  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
1781  queue, global_tid));
1782  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1783 
1784 #ifdef KMP_DEBUG
1785  i = 0;
1786  for (thunk = queue->tq_free_thunks; thunk != NULL;
1787  thunk = thunk->th.th_next_free)
1788  ++i;
1789  KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
1790 #endif
1791  /* release storage for unattached serial queue */
1792  KF_TRACE(50,
1793  ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1794 
1795  queue->tq_flags |= TQF_DEALLOCATED;
1796  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1797  }
1798 
1799  KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1800 }
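
/* For clarity, the "steal work while waiting" idiom used by the wait loops in
   __kmpc_end_taskq above, condensed into one place.  Illustration only: the
   function name is made up and the block is excluded from the build; it uses
   only helpers that already appear in this file. */
#if 0
static void __kmp_steal_while_waiting_sketch(kmp_taskq_t *tq, ident_t *loc,
                                             kmp_int32 global_tid,
                                             kmpc_task_queue_t *queue,
                                             int in_parallel) {
  kmpc_thunk_t *thunk;
  kmp_uint32 spins;

  KMP_INIT_YIELD(spins);

  while (!__kmp_taskq_tasks_finished(queue)) {
    /* try to help by stealing a task from a descendant queue */
    thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);

    if (thunk != NULL)
      __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);

    /* nothing found: spin for a while, then start yielding the processor */
    KMP_YIELD_WHEN(thunk == NULL, spins);
  }
}
#endif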
1801 
1802 /* Enqueues the task for a thunk previously created by __kmpc_task_buffer. */
1803 /* Returns nonzero if this enqueue just filled up the queue. */
1804 
1805 kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
1806  kmp_int32 ret;
1807  kmpc_task_queue_t *queue;
1808  int in_parallel;
1809  kmp_taskq_t *tq;
1810 
1811  KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));
1812 
1813  KMP_DEBUG_ASSERT(!(thunk->th_flags &
1814  TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1815 
1816  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1817  queue = thunk->th.th_shareds->sv_queue;
1818  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1819 
1820  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1821  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1822 
1823  /* For serial execution dequeue the preceding task and execute it, if one
1824  * exists */
1825  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1826 
1827  if (!in_parallel && queue->tq_nfull > 0) {
1828  kmpc_thunk_t *prev_thunk;
1829 
1830  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1831 
1832  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1833 
1834  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1835  prev_thunk, queue));
1836 
1837  __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
1838  }
1839 
1840  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
1841  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
1842  task queue is not full and allocates a thunk (which is then passed to
1843  __kmpc_task()). So, the enqueue below should never fail due to a full
1844  queue. */
1845 
1846  KF_TRACE(100, ("Enqueueing this Task on (%d):\n", global_tid));
1847  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1848 
1849  ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);
1850 
1851  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1852  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1853 
1854  KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));
1855 
1856  return ret;
1857 }
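
/* A hedged sketch of the instrumentation sequence described in the comment
   above: __kmpc_task_buffer() to obtain a thunk, initialization of the task's
   private variables, then __kmpc_task() to enqueue it.  Illustration only:
   "my_task_routine" is a hypothetical task routine, the private-variable
   initialization is abstracted away, and real compiler-generated code will
   differ in detail. */
#if 0
extern void my_task_routine(kmp_int32 gtid,
                            kmpc_thunk_t *thunk); /* assumed kmpc_task_t shape */

static void __kmp_emit_one_task_sketch(ident_t *loc, kmp_int32 gtid,
                                       kmpc_thunk_t *taskq_thunk) {
  /* ask the runtime for a thunk describing one task of this taskq */
  kmpc_thunk_t *new_thunk =
      __kmpc_task_buffer(loc, gtid, taskq_thunk, my_task_routine);

  /* ... the task's private data would be initialized through new_thunk
     here ... */

  /* enqueue the task; a nonzero return means this enqueue just filled the
     queue */
  (void)__kmpc_task(loc, gtid, new_thunk);
}
#endif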
1858 
1859 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1860 /* this should never be called unless in a parallel context */
1861 
1862 void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
1863  kmp_int32 status) {
1864  kmpc_task_queue_t *queue;
1865  kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1866  int tid = __kmp_tid_from_gtid(global_tid);
1867 
1868  KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1869  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1870  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1871 
1872  queue = thunk->th.th_shareds->sv_queue;
1873 
1874  if (__kmp_env_consistency_check)
1875  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1876 
1877  /* thunk->th_task is the taskq_task */
1878  KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);
1879 
1880  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1881  KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);
1882 
1883  /* dequeue taskq thunk from curr_thunk stack */
1884  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1885  thunk->th_encl_thunk = NULL;
1886 
1887  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1888 
1889  thunk->th_status = status;
1890 
1891  // Flush thunk->th_status before taskq_task enqueued to avoid race condition
1892  KMP_MB();
1893 
1894  /* enqueue taskq_task in thunk into special slot in queue */
1895  /* GEH - probably don't need to lock the taskq slot since only one */
1896  /* thread enqueues and a lock is already taken at the dequeue point */
1897 
1898  queue->tq_taskq_slot = thunk;
1899 
1900  KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1901 }
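
/* The "dequeue taskq thunk from curr_thunk stack" lines above pop the
   per-thread current-thunk stack, which is a singly linked list threaded
   through th_encl_thunk.  A minimal sketch of the matching push/pop pair
   (function names are made up; the push itself happens elsewhere in this
   file where a task begins executing): */
#if 0
static void __kmp_push_curr_thunk_sketch(kmp_taskq_t *tq, int tid,
                                         kmpc_thunk_t *thunk) {
  thunk->th_encl_thunk = tq->tq_curr_thunk[tid]; /* remember enclosing thunk */
  tq->tq_curr_thunk[tid] = thunk; /* new top of the stack */
}

static void __kmp_pop_curr_thunk_sketch(kmp_taskq_t *tq, int tid,
                                        kmpc_thunk_t *thunk) {
  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk; /* restore enclosing thunk */
  thunk->th_encl_thunk = NULL; /* unlink the popped thunk */
}
#endif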
1902 
1903 /* ends a taskq_task; done generating tasks */
1904 
1905 void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
1906  kmpc_thunk_t *thunk) {
1907  kmp_taskq_t *tq;
1908  kmpc_task_queue_t *queue;
1909  int in_parallel;
1910  int tid;
1911 
1912  KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1913 
1914  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1915  queue = thunk->th.th_shareds->sv_queue;
1916  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1917  tid = __kmp_tid_from_gtid(global_tid);
1918 
1919  if (__kmp_env_consistency_check)
1920  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1921 
1922  if (in_parallel) {
1923 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1924  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1925  TQF_ALL_TASKS_QUEUED);
1926 #else
1927  {
1928  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1929 
1930  // Make sure data structures are in a consistent state before querying them
1931  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1932  KMP_MB();
1933 
1934  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1935  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1936  }
1937 #endif
1938  }
1939 
1940  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1941  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in
1942  the queue if TQF_IS_LASTPRIVATE so we can positively identify that last
1943  task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.
1944  When __kmpc_end_taskq_task() is called we are done generating all the
1945  tasks, so we know the last one in the queue is the lastprivate task.
1946  Mark the queue as having gotten to this state via tq_flags &
1947  TQF_IS_LAST_TASK; when that task actually executes mark it via th_flags &
1948  TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to do
1949  copy-outs after execution). */
1950  if (!in_parallel) {
1951  /* No synchronization needed for serial context */
1952  queue->tq_flags |= TQF_IS_LAST_TASK;
1953  } else {
1954 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1955  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1956  TQF_IS_LAST_TASK);
1957 #else
1958  {
1959  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1960 
1961  // Make sure data structures are in a consistent state before querying them
1962  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1963  KMP_MB();
1964 
1965  queue->tq_flags |= TQF_IS_LAST_TASK;
1966  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1967  }
1968 #endif
1969  /* to prevent race condition where last task is dequeued but */
1970  /* flag isn't visible yet (not sure about this) */
1971  KMP_MB();
1972  }
1973  }
1974 
1975  /* dequeue taskq thunk from curr_thunk stack */
1976  if (in_parallel) {
1977  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1978  thunk->th_encl_thunk = NULL;
1979 
1980  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1981  }
1982 
1983  KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
1984 }
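
/* A hedged sketch of how instrumented task code might honor the
   TQF_IS_LAST_TASK bit described above: after doing its work, the task
   performs the lastprivate copy-out only when its thunk carries that flag.
   Illustration only: "my_lastprivate_task", "shared_result" and the private
   value are made-up names; the real private-data layout is compiler-defined. */
#if 0
extern int *shared_result; /* hypothetical lastprivate destination */

static void my_lastprivate_task(kmp_int32 gtid, kmpc_thunk_t *thunk) {
  int priv = 0; /* stands in for this task's private copy */

  /* ... the task's real work would compute priv here ... */

  if (thunk->th_flags & TQF_IS_LAST_TASK) {
    /* only the sequentially last task copies its private value out */
    *shared_result = priv;
  }
}
#endif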
1985 
1986 /* returns thunk for a regular task based on taskq_thunk */
1987 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
1988 
1989 kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
1990  kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
1991  kmp_taskq_t *tq;
1992  kmpc_task_queue_t *queue;
1993  kmpc_thunk_t *new_thunk;
1994  int in_parallel;
1995 
1996  KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));
1997 
1998  KMP_DEBUG_ASSERT(
1999  taskq_thunk->th_flags &
2000  TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2001 
2002  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
2003  queue = taskq_thunk->th.th_shareds->sv_queue;
2004  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2005 
2006  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
2007  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
2008  task queue is not full and allocates a thunk (which is then passed to
2009  __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be
2010  the next to be enqueued in __kmpc_task(). */
2011 
2012  new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
2013  new_thunk->th.th_shareds =
2014  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
2015  new_thunk->th_encl_thunk = NULL;
2016  new_thunk->th_task = task;
2017 
2018  /* GEH - shouldn't need to lock the read of tq_flags here */
2019  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2020 
2021  new_thunk->th_status = 0;
2022 
2023  KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));
2024 
2025  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2026  KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));
2027 
2028  KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2029 
2030  return new_thunk;
2031 }
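
/* Usage note: as described in the comment above, the usual pairing is one
   __kmpc_task_buffer() call per task to obtain and pre-fill a thunk, followed
   by exactly one __kmpc_task() call to enqueue that same thunk (see the
   sketch following __kmpc_task() earlier in this file). */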