LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_wait_release.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* ------------------------------------------------------------------------ */
29 /* ------------------------------------------------------------------------ */
30 
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr );
34 static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
35 static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
36 
37 #ifdef OMP_45_ENABLED
38 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
39 #endif
40 
41 #ifdef BUILD_TIED_TASK_STACK
42 
43 //---------------------------------------------------------------------------
44 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
45 // from top to bottom
46 //
47 // gtid: global thread identifier for thread containing stack
48 // thread_data: thread data for task team thread containing stack
49 // threshold: value above which the trace statement triggers
50 // location: string identifying call site of this function (for trace)
51 
52 static void
53 __kmp_trace_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data, int threshold, char *location )
54 {
55  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
56  kmp_taskdata_t **stack_top = task_stack -> ts_top;
57  kmp_int32 entries = task_stack -> ts_entries;
58  kmp_taskdata_t *tied_task;
59 
60  KA_TRACE(threshold, ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
61  "first_block = %p, stack_top = %p \n",
62  location, gtid, entries, task_stack->ts_first_block, stack_top ) );
63 
64  KMP_DEBUG_ASSERT( stack_top != NULL );
65  KMP_DEBUG_ASSERT( entries > 0 );
66 
67  while ( entries != 0 )
68  {
69  KMP_DEBUG_ASSERT( stack_top != & task_stack->ts_first_block.sb_block[0] );
70  // fix up ts_top if we need to pop from previous block
71  if ( ( entries & TASK_STACK_INDEX_MASK ) == 0 )
72  {
73  kmp_stack_block_t *stack_block = (kmp_stack_block_t *) (stack_top) ;
74 
75  stack_block = stack_block -> sb_prev;
76  stack_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
77  }
78 
79  // finish bookkeeping
80  stack_top--;
81  entries--;
82 
83  tied_task = * stack_top;
84 
85  KMP_DEBUG_ASSERT( tied_task != NULL );
86  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
87 
88  KA_TRACE(threshold, ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
89  "stack_top=%p, tied_task=%p\n",
90  location, gtid, entries, stack_top, tied_task ) );
91  }
92  KMP_DEBUG_ASSERT( stack_top == & task_stack->ts_first_block.sb_block[0] );
93 
94  KA_TRACE(threshold, ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
95  location, gtid ) );
96 }
97 
98 //---------------------------------------------------------------------------
99 // __kmp_init_task_stack: initialize the task stack for the first time
100 // after a thread_data structure is created.
101 // It should not be necessary to do this again (assuming the stack works).
102 //
103 // gtid: global thread identifier of calling thread
104 // thread_data: thread data for task team thread containing stack
105 
106 static void
107 __kmp_init_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
108 {
109  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
110  kmp_stack_block_t *first_block;
111 
112  // set up the first block of the stack
113  first_block = & task_stack -> ts_first_block;
114  task_stack -> ts_top = (kmp_taskdata_t **) first_block;
115  memset( (void *) first_block, '\0', TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
116 
117  // initialize the stack to be empty
118  task_stack -> ts_entries = TASK_STACK_EMPTY;
119  first_block -> sb_next = NULL;
120  first_block -> sb_prev = NULL;
121 }
122 
123 
124 //---------------------------------------------------------------------------
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 
130 static void
131 __kmp_free_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
132 {
133  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
134  kmp_stack_block_t *stack_block = & task_stack -> ts_first_block;
135 
136  KMP_DEBUG_ASSERT( task_stack -> ts_entries == TASK_STACK_EMPTY );
137  // free from the second block of the stack
138  while ( stack_block != NULL ) {
139  kmp_stack_block_t *next_block = (stack_block) ? stack_block -> sb_next : NULL;
140 
141  stack_block -> sb_next = NULL;
142  stack_block -> sb_prev = NULL;
143  if (stack_block != & task_stack -> ts_first_block) {
144  __kmp_thread_free( __kmp_threads[ gtid ], stack_block ); // free the block, if not the first
145  }
146  stack_block = next_block;
147  }
148  // initialize the stack to be empty
149  task_stack -> ts_entries = 0;
150  task_stack -> ts_top = NULL;
151 }
152 
153 
154 //---------------------------------------------------------------------------
155 // __kmp_push_task_stack: Push the tied task onto the task stack.
156 // Grow the stack if necessary by allocating another block.
157 //
158 // gtid: global thread identifier for calling thread
159 // thread: thread info for thread containing stack
160 // tied_task: the task to push on the stack
161 
162 static void
163 __kmp_push_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t * tied_task )
164 {
165  // GEH - need to consider what to do if tt_threads_data not allocated yet
166  kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
167  tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
168  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
169 
170  if ( tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser ) {
171  return; // Don't push anything on stack if team or team tasks are serialized
172  }
173 
174  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
175  KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
176 
177  KA_TRACE(20, ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
178  gtid, thread, tied_task ) );
179  // Store entry
180  * (task_stack -> ts_top) = tied_task;
181 
182  // Do bookkeeping for next push
183  task_stack -> ts_top++;
184  task_stack -> ts_entries++;
185 
186  if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
187  {
188  // Find beginning of this task block
189  kmp_stack_block_t *stack_block =
190  (kmp_stack_block_t *) (task_stack -> ts_top - TASK_STACK_BLOCK_SIZE);
191 
192  // Check if we already have a block
193  if ( stack_block -> sb_next != NULL )
194  { // reset ts_top to beginning of next block
195  task_stack -> ts_top = & stack_block -> sb_next -> sb_block[0];
196  }
197  else
198  { // Alloc new block and link it up
199  kmp_stack_block_t *new_block = (kmp_stack_block_t *)
200  __kmp_thread_calloc(thread, sizeof(kmp_stack_block_t));
201 
202  task_stack -> ts_top = & new_block -> sb_block[0];
203  stack_block -> sb_next = new_block;
204  new_block -> sb_prev = stack_block;
205  new_block -> sb_next = NULL;
206 
207  KA_TRACE(30, ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
208  gtid, tied_task, new_block ) );
209  }
210  }
211  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
212 }
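// A worked example of the block arithmetic above (a sketch; it assumes only
// that TASK_STACK_INDEX_MASK == TASK_STACK_BLOCK_SIZE - 1 with the block size
// a power of two, which the masking itself implies):
//
//     // Suppose TASK_STACK_BLOCK_SIZE were 16. After the 16th push,
//     //   ts_entries == 16 and (16 & 0xF) == 0,
//     // so ts_top has just stepped past the end of the current sb_block[].
//     // The code rewinds by TASK_STACK_BLOCK_SIZE to reach the block header,
//     // then points ts_top at sb_next->sb_block[0], allocating and linking
//     // sb_next the first time this boundary is crossed. The pop path below
//     // performs the mirror-image fix-up through sb_prev.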
213 
214 //---------------------------------------------------------------------------
215 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
216 // the task, just check to make sure it matches the ending task passed in.
217 //
218 // gtid: global thread identifier for the calling thread
219 // thread: thread info structure containing stack
220 // tied_task: the task popped off the stack
221 // ending_task: the task that is ending (should match popped task)
222 
223 static void
224 __kmp_pop_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *ending_task )
225 {
226  // GEH - need to consider what to do if tt_threads_data not allocated yet
227  kmp_thread_data_t *thread_data = & thread -> th.th_task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
228  kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
229  kmp_taskdata_t *tied_task;
230 
231  if ( ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser ) {
232  return; // Don't pop anything from stack if team or team tasks are serialized
233  }
234 
235  KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
236  KMP_DEBUG_ASSERT( task_stack -> ts_entries > 0 );
237 
238  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, thread ) );
239 
240  // fix up ts_top if we need to pop from previous block
241  if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
242  {
243  kmp_stack_block_t *stack_block =
244  (kmp_stack_block_t *) (task_stack -> ts_top) ;
245 
246  stack_block = stack_block -> sb_prev;
247  task_stack -> ts_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
248  }
249 
250  // finish bookkeeping
251  task_stack -> ts_top--;
252  task_stack -> ts_entries--;
253 
254  tied_task = * (task_stack -> ts_top );
255 
256  KMP_DEBUG_ASSERT( tied_task != NULL );
257  KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
258  KMP_DEBUG_ASSERT( tied_task == ending_task ); // If we built the stack correctly
259 
260  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
261  return;
262 }
263 #endif /* BUILD_TIED_TASK_STACK */
264 
265 //---------------------------------------------------
266 // __kmp_push_task: Add a task to the thread's deque
267 
268 static kmp_int32
269 __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
270 {
271  kmp_info_t * thread = __kmp_threads[ gtid ];
272  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
273  kmp_task_team_t * task_team = thread->th.th_task_team;
274  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
275  kmp_thread_data_t * thread_data;
276 
277  KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata ) );
278 
279  if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
280  // untied task needs to increment counter so that the task structure is not freed prematurely
281  kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
282  KA_TRACE(20, ( "__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
283  gtid, counter, taskdata ) );
284  }
285 
286  // The first check avoids building task_team thread data if serialized
287  if ( taskdata->td_flags.task_serial ) {
288  KA_TRACE(20, ( "__kmp_push_task: T#%d team serialized; returning TASK_NOT_PUSHED for task %p\n",
289  gtid, taskdata ) );
290  return TASK_NOT_PUSHED;
291  }
292 
293  // Now that serialized tasks have returned, we can assume that we are not in immediate exec mode
294  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
295  if ( ! KMP_TASKING_ENABLED(task_team) ) {
296  __kmp_enable_tasking( task_team, thread );
297  }
298  KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_found_tasks) == TRUE );
299  KMP_DEBUG_ASSERT( TCR_PTR(task_team -> tt.tt_threads_data) != NULL );
300 
301  // Find tasking deque specific to encountering thread
302  thread_data = & task_team -> tt.tt_threads_data[ tid ];
303 
304  // No lock needed since only owner can allocate
305  if (thread_data -> td.td_deque == NULL ) {
306  __kmp_alloc_task_deque( thread, thread_data );
307  }
308 
309  // Check if deque is full
310  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
311  {
312  KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
313  gtid, taskdata ) );
314  return TASK_NOT_PUSHED;
315  }
316 
317  // Lock the deque for the task push operation
318  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
319 
320 #if OMP_45_ENABLED
321  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
322  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
323  {
324  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
325  KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
326  gtid, taskdata ) );
327  return TASK_NOT_PUSHED;
328  }
329 #else
330  // Must have room since no thread other than the calling thread can add tasks
331  KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
332 #endif
333 
334  thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata; // Push taskdata
335  // Wrap index.
336  thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
337  TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1); // Adjust task count
338 
339  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
340  "task=%p ntasks=%d head=%u tail=%u\n",
341  gtid, taskdata, thread_data->td.td_deque_ntasks,
342  thread_data->td.td_deque_head, thread_data->td.td_deque_tail) );
343 
344  __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
345 
346  return TASK_SUCCESSFULLY_PUSHED;
347 }
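//---------------------------------------------------------------------------
// The per-thread deque used above is a fixed-size ring buffer: td_deque_head
// and td_deque_tail index into td_deque[], td_deque_ntasks counts occupancy,
// and the wrap-around arithmetic implies TASK_DEQUE_MASK == TASK_DEQUE_SIZE - 1
// with the size a power of two. A sketch with a hypothetical size of 8:
//
//     // head == 6, tail == 1, ntasks == 3   ->  slots 6, 7, 0 are occupied
//     // push: td_deque[1] = taskdata;
//     //       tail = (1 + 1) & 7;           ->  tail == 2, ntasks == 4
//     // full: ntasks == 8, so the checks above return TASK_NOT_PUSHED
//
// The owning thread removes tasks from the tail and thieves steal from the
// head (see __kmp_remove_my_task and __kmp_steal_task later in this file).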
348 
349 
350 //-----------------------------------------------------------------------------------------
351 // __kmp_pop_current_task_from_thread: restore the current task of the given thread when its team ends
352 // this_thr: thread structure to set current_task in.
353 
354 void
355 __kmp_pop_current_task_from_thread( kmp_info_t *this_thr )
356 {
357  KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(enter): T#%d this_thread=%p, curtask=%p, "
358  "curtask_parent=%p\n",
359  0, this_thr, this_thr -> th.th_current_task,
360  this_thr -> th.th_current_task -> td_parent ) );
361 
362  this_thr -> th.th_current_task = this_thr -> th.th_current_task -> td_parent;
363 
364  KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(exit): T#%d this_thread=%p, curtask=%p, "
365  "curtask_parent=%p\n",
366  0, this_thr, this_thr -> th.th_current_task,
367  this_thr -> th.th_current_task -> td_parent ) );
368 }
369 
370 
371 //---------------------------------------------------------------------------------------
372 // __kmp_push_current_task_to_thread: set up current task in called thread for a new team
373 // this_thr: thread structure to set up
374 // team: team for implicit task data
375 // tid: thread within team to set up
376 
377 void
378 __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid )
379 {
380  // the thread's current task becomes the parent of the newly created implicit tasks of the new team
381  KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p curtask=%p "
382  "parent_task=%p\n",
383  tid, this_thr, this_thr->th.th_current_task,
384  team->t.t_implicit_task_taskdata[tid].td_parent ) );
385 
386  KMP_DEBUG_ASSERT (this_thr != NULL);
387 
388  if( tid == 0 ) {
389  if( this_thr->th.th_current_task != & team -> t.t_implicit_task_taskdata[ 0 ] ) {
390  team -> t.t_implicit_task_taskdata[ 0 ].td_parent = this_thr->th.th_current_task;
391  this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ 0 ];
392  }
393  } else {
394  team -> t.t_implicit_task_taskdata[ tid ].td_parent = team -> t.t_implicit_task_taskdata[ 0 ].td_parent;
395  this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ tid ];
396  }
397 
398  KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p curtask=%p "
399  "parent_task=%p\n",
400  tid, this_thr, this_thr->th.th_current_task,
401  team->t.t_implicit_task_taskdata[tid].td_parent ) );
402 }
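//---------------------------------------------------------------------------
// The resulting links, sketched for illustration (T_enc denotes whatever task
// the master thread was executing when it encountered the parallel construct):
//
//     // tid == 0:  t_implicit_task_taskdata[0].td_parent = T_enc
//     //            th_current_task = &t_implicit_task_taskdata[0]
//     //            (skipped if that implicit task is already current)
//     // tid != 0:  t_implicit_task_taskdata[tid].td_parent =
//     //                t_implicit_task_taskdata[0].td_parent   (i.e. T_enc)
//     //            th_current_task = &t_implicit_task_taskdata[tid]
//
// so every implicit task of the new team has the encountering task as its
// parent, and __kmp_pop_current_task_from_thread above undoes the switch by
// following td_parent.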
403 
404 
405 //----------------------------------------------------------------------
406 // __kmp_task_start: bookkeeping for a task starting execution
407 // GTID: global thread id of calling thread
408 // task: task starting execution
409 // current_task: task suspending
410 
411 static void
412 __kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_task )
413 {
414  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
415  kmp_info_t * thread = __kmp_threads[ gtid ];
416 
417  KA_TRACE(10, ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
418  gtid, taskdata, current_task) );
419 
420  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
421 
422  // mark currently executing task as suspended
423  // TODO: GEH - make sure root team implicit task is initialized properly.
424  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
425  current_task -> td_flags.executing = 0;
426 
427  // Add task to stack if tied
428 #ifdef BUILD_TIED_TASK_STACK
429  if ( taskdata -> td_flags.tiedness == TASK_TIED )
430  {
431  __kmp_push_task_stack( gtid, thread, taskdata );
432  }
433 #endif /* BUILD_TIED_TASK_STACK */
434 
435  // mark starting task as executing and as current task
436  thread -> th.th_current_task = taskdata;
437 
438  KMP_DEBUG_ASSERT( taskdata->td_flags.started == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
439  KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
440  taskdata -> td_flags.started = 1;
441  taskdata -> td_flags.executing = 1;
442  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
443  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
444 
445  // GEH TODO: shouldn't we pass some sort of location identifier here?
446  // APT: yes, we will pass location here.
447  // need to store current thread state (in a thread or taskdata structure)
448  // before setting work_state, otherwise wrong state is set after end of task
449 
450  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
451  gtid, taskdata ) );
452 
453 #if OMPT_SUPPORT
454  if (ompt_enabled &&
455  ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
456  kmp_taskdata_t *parent = taskdata->td_parent;
457  ompt_callbacks.ompt_callback(ompt_event_task_begin)(
458  parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
459  parent ? &(parent->ompt_task_info.frame) : NULL,
460  taskdata->ompt_task_info.task_id,
461  taskdata->ompt_task_info.function);
462  }
463 #endif
464 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
465  /* OMPT emit all dependences if requested by the tool */
466  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
467  ompt_callbacks.ompt_callback(ompt_event_task_dependences))
468  {
469  ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
470  taskdata->ompt_task_info.task_id,
471  taskdata->ompt_task_info.deps,
472  taskdata->ompt_task_info.ndeps
473  );
474  /* We can now free the allocated memory for the dependencies */
475  KMP_OMPT_DEPS_FREE (thread, taskdata->ompt_task_info.deps);
476  taskdata->ompt_task_info.deps = NULL;
477  taskdata->ompt_task_info.ndeps = 0;
478  }
479 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
480 
481  return;
482 }
483 
484 
485 //----------------------------------------------------------------------
486 // __kmpc_omp_task_begin_if0: report that a given serialized task has started execution
487 // loc_ref: source location information; points to beginning of task block.
488 // gtid: global thread number.
489 // task: task thunk for the started task.
490 
491 void
492 __kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
493 {
494  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
495  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
496 
497  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p current_task=%p\n",
498  gtid, loc_ref, taskdata, current_task ) );
499 
500  if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
501  // untied task needs to increment counter so that the task structure is not freed prematurely
502  kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
503  KA_TRACE(20, ( "__kmpc_omp_task_begin_if0: T#%d untied_count (%d) incremented for task %p\n",
504  gtid, counter, taskdata ) );
505  }
506 
507  taskdata -> td_flags.task_serial = 1; // Execute this task immediately, not deferred.
508  __kmp_task_start( gtid, task, current_task );
509 
510  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n",
511  gtid, loc_ref, taskdata ) );
512 
513  return;
514 }
515 
516 #ifdef TASK_UNUSED
517 //----------------------------------------------------------------------
518 // __kmpc_omp_task_begin: report that a given task has started execution
519 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
520 
521 void
522 __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
523 {
524  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
525 
526  KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
527  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task ) );
528 
529  __kmp_task_start( gtid, task, current_task );
530 
531  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n",
532  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
533 
534  return;
535 }
536 #endif // TASK_UNUSED
537 
538 
539 //-------------------------------------------------------------------------------------
540 // __kmp_free_task: free the current task space and the space for shareds
541 // gtid: Global thread ID of calling thread
542 // taskdata: task to free
543 // thread: thread data structure of caller
544 
545 static void
546 __kmp_free_task( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
547 {
548  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n",
549  gtid, taskdata) );
550 
551  // Check to make sure all flags and counters have the correct values
552  KMP_DEBUG_ASSERT( taskdata->td_flags.tasktype == TASK_EXPLICIT );
553  KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 );
554  KMP_DEBUG_ASSERT( taskdata->td_flags.complete == 1 );
555  KMP_DEBUG_ASSERT( taskdata->td_flags.freed == 0 );
556  KMP_DEBUG_ASSERT( TCR_4(taskdata->td_allocated_child_tasks) == 0 || taskdata->td_flags.task_serial == 1);
557  KMP_DEBUG_ASSERT( TCR_4(taskdata->td_incomplete_child_tasks) == 0 );
558 
559  taskdata->td_flags.freed = 1;
560  ANNOTATE_HAPPENS_BEFORE(taskdata);
561  // deallocate the taskdata and shared variable blocks associated with this task
562  #if USE_FAST_MEMORY
563  __kmp_fast_free( thread, taskdata );
564  #else /* ! USE_FAST_MEMORY */
565  __kmp_thread_free( thread, taskdata );
566  #endif
567 
568  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n",
569  gtid, taskdata) );
570 }
571 
572 //-------------------------------------------------------------------------------------
573 // __kmp_free_task_and_ancestors: free the current task and ancestors without children
574 //
575 // gtid: Global thread ID of calling thread
576 // taskdata: task to free
577 // thread: thread data structure of caller
578 
579 static void
580 __kmp_free_task_and_ancestors( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
581 {
582 #if OMP_45_ENABLED
583  // Proxy tasks must always be allowed to free their parents
584  // because they can be run in background even in serial mode.
585  kmp_int32 team_serial = ( taskdata->td_flags.team_serial ||
586  taskdata->td_flags.tasking_ser ) && !taskdata->td_flags.proxy;
587 #else
588  kmp_int32 team_serial = taskdata->td_flags.team_serial ||
589  taskdata->td_flags.tasking_ser;
590 #endif
591  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
592 
593  kmp_int32 children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
594  KMP_DEBUG_ASSERT( children >= 0 );
595 
596  // Now, go up the ancestor tree to see if any ancestors can now be freed.
597  while ( children == 0 )
598  {
599  kmp_taskdata_t * parent_taskdata = taskdata -> td_parent;
600 
601  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
602  "and freeing itself\n", gtid, taskdata) );
603 
604  // --- Deallocate my ancestor task ---
605  __kmp_free_task( gtid, taskdata, thread );
606 
607  taskdata = parent_taskdata;
608 
609  // Stop checking ancestors at implicit task
610  // instead of walking up ancestor tree to avoid premature deallocation of ancestors.
611  if ( team_serial || taskdata -> td_flags.tasktype == TASK_IMPLICIT )
612  return;
613 
614  // Predecrement simulated by "- 1" calculation
615  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
616  KMP_DEBUG_ASSERT( children >= 0 );
617  }
618 
619  KA_TRACE(20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
620  "not freeing it yet\n", gtid, taskdata, children) );
621 }
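//---------------------------------------------------------------------------
// td_allocated_child_tasks is in effect a reference count on the taskdata
// storage: __kmp_task_alloc below starts it at 1 for the task itself and
// increments the parent's count for every explicit child allocated under it.
// A lifetime sketch for a hypothetical parent P with one child C:
//
//     // alloc P:           P.td_allocated_child_tasks == 1
//     // alloc C under P:   P == 2, C == 1
//     // C finishes:        C is decremented to 0 and freed here, then P is
//     //                    decremented to 1 and kept ("not freeing it yet")
//     // P finishes:        P is decremented to 0 and freed, and the walk
//     //                    continues until it reaches an implicit task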
622 
623 //---------------------------------------------------------------------
624 // __kmp_task_finish: bookkeeping to do when a task finishes execution
625 // gtid: global thread ID for calling thread
626 // task: task to be finished
627 // resumed_task: task to be resumed. (may be NULL if task is serialized)
628 
629 static void
630 __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task )
631 {
632  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
633  kmp_info_t * thread = __kmp_threads[ gtid ];
634  kmp_task_team_t * task_team = thread->th.th_task_team; // might be NULL for serial teams...
635  kmp_int32 children = 0;
636 
637 #if OMPT_SUPPORT
638  if (ompt_enabled &&
639  ompt_callbacks.ompt_callback(ompt_event_task_end)) {
640  kmp_taskdata_t *parent = taskdata->td_parent;
641  ompt_callbacks.ompt_callback(ompt_event_task_end)(
642  taskdata->ompt_task_info.task_id);
643  }
644 #endif
645 
646  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
647  gtid, taskdata, resumed_task) );
648 
649  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
650 
651  // Pop task from stack if tied
652 #ifdef BUILD_TIED_TASK_STACK
653  if ( taskdata -> td_flags.tiedness == TASK_TIED )
654  {
655  __kmp_pop_task_stack( gtid, thread, taskdata );
656  }
657 #endif /* BUILD_TIED_TASK_STACK */
658 
659  if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
660  // untied task needs to check the counter so that the task structure is not freed prematurely
661  kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
662  KA_TRACE(20, ( "__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
663  gtid, counter, taskdata ) );
664  if ( counter > 0 ) {
665  // untied task is not done, to be continued possibly by other thread, do not free it now
666  if (resumed_task == NULL) {
667  KMP_DEBUG_ASSERT( taskdata->td_flags.task_serial );
668  resumed_task = taskdata->td_parent; // In a serialized task, the resumed task is the parent
669  }
670  thread->th.th_current_task = resumed_task; // restore current_task
671  resumed_task->td_flags.executing = 1; // resume previous task
672  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, resuming task %p\n",
673  gtid, taskdata, resumed_task) );
674  return;
675  }
676  }
677 
678  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
679  taskdata -> td_flags.complete = 1; // mark the task as completed
680  KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 );
681  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
682 
683  // Only need to keep track of count if team parallel and tasking not serialized
684  if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
685  // Predecrement simulated by "- 1" calculation
686  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
687  KMP_DEBUG_ASSERT( children >= 0 );
688 #if OMP_40_ENABLED
689  if ( taskdata->td_taskgroup )
690  KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
691 #if OMP_45_ENABLED
692  }
693  // if we found proxy tasks there could exist a dependency chain
694  // with the proxy task as origin
695  if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) || (task_team && task_team->tt.tt_found_proxy_tasks) ) {
696 #endif
697  __kmp_release_deps(gtid,taskdata);
698 #endif
699  }
700 
701  // td_flags.executing must be marked as 0 after __kmp_release_deps has been called
702  // Otherwise, if a task is executed immediately from the release_deps code
703  // the flag will be reset to 1 again by this same function
704  KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 );
705  taskdata -> td_flags.executing = 0; // suspend the finishing task
706 
707  KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
708  gtid, taskdata, children) );
709 
710 #if OMP_40_ENABLED
711  /* If the task's destructor thunk flag has been set, we need to invoke the
712  destructor thunk that has been generated by the compiler.
713  The code is placed here, since at this point other tasks might have been released
714  hence overlapping the destructor invocations with some other work in the
715  released tasks. The OpenMP spec is not specific on when the destructors are
716  invoked, so we should be free to choose.
717  */
718  if (taskdata->td_flags.destructors_thunk) {
719  kmp_routine_entry_t destr_thunk = task->data1.destructors;
720  KMP_ASSERT(destr_thunk);
721  destr_thunk(gtid, task);
722  }
723 #endif // OMP_40_ENABLED
724 
725  // bookkeeping for resuming task:
726  // GEH - note tasking_ser => task_serial
727  KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
728  taskdata->td_flags.task_serial);
729  if ( taskdata->td_flags.task_serial )
730  {
731  if (resumed_task == NULL) {
732  resumed_task = taskdata->td_parent; // In a serialized task, the resumed task is the parent
733  }
734  else
735 #if OMP_45_ENABLED
736  if ( !(task_team && task_team->tt.tt_found_proxy_tasks) )
737 #endif
738  {
739  // verify resumed task passed in points to parent
740  KMP_DEBUG_ASSERT( resumed_task == taskdata->td_parent );
741  }
742  }
743  else {
744  KMP_DEBUG_ASSERT( resumed_task != NULL ); // verify that resumed task is passed as argument
745  }
746 
747  // Free this task and then ancestor tasks if they have no children.
748  // Restore th_current_task first as suggested by John:
749  // johnmc: if an asynchronous inquiry peers into the runtime system
750  // it doesn't see the freed task as the current task.
751  thread->th.th_current_task = resumed_task;
752  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
753 
754  // TODO: GEH - make sure root team implicit task is initialized properly.
755  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
756  resumed_task->td_flags.executing = 1; // resume previous task
757 
758  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
759  gtid, taskdata, resumed_task) );
760 
761  return;
762 }
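//---------------------------------------------------------------------------
// Summary of the untied-task accounting used above (a restatement of the code,
// gathered in one place):
//
//     // __kmp_push_task / __kmpc_omp_task_begin_if0 :  td_untied_count++
//     // __kmp_task_finish                           :  td_untied_count--
//     //   count still > 0  ->  only restore th_current_task and return; the
//     //                        taskdata stays allocated for the next part
//     //   count reaches 0  ->  fall through to normal completion (mark
//     //                        complete, release dependences, free ancestors)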
763 
764 //---------------------------------------------------------------------
765 // __kmpc_omp_task_complete_if0: report that a task has completed execution
766 // loc_ref: source location information; points to end of task block.
767 // gtid: global thread number.
768 // task: task thunk for the completed task.
769 
770 void
771 __kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
772 {
773  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
774  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
775 
776  __kmp_task_finish( gtid, task, NULL ); // this routine will provide task to resume
777 
778  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
779  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
780 
781  return;
782 }
783 
784 #ifdef TASK_UNUSED
785 //---------------------------------------------------------------------
786 // __kmpc_omp_task_complete: report that a task has completed execution
787 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
788 
789 void
790 __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
791 {
792  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n",
793  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
794 
795  __kmp_task_finish( gtid, task, NULL ); // Not sure how to find task to resume
796 
797  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n",
798  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
799  return;
800 }
801 #endif // TASK_UNUSED
802 
803 
804 #if OMPT_SUPPORT
805 //----------------------------------------------------------------------------------------------------
806 // __kmp_task_init_ompt:
807 // Initialize OMPT fields maintained by a task. This will only be called after
808 // ompt_tool, so we already know whether ompt is enabled or not.
809 
810 static inline void
811 __kmp_task_init_ompt( kmp_taskdata_t * task, int tid, void * function )
812 {
813  if (ompt_enabled) {
814  task->ompt_task_info.task_id = __ompt_task_id_new(tid);
815  task->ompt_task_info.function = function;
816  task->ompt_task_info.frame.exit_runtime_frame = NULL;
817  task->ompt_task_info.frame.reenter_runtime_frame = NULL;
818 #if OMP_40_ENABLED
819  task->ompt_task_info.ndeps = 0;
820  task->ompt_task_info.deps = NULL;
821 #endif /* OMP_40_ENABLED */
822  }
823 }
824 #endif
825 
826 
827 //----------------------------------------------------------------------------------------------------
828 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit task for a given thread
829 //
830 // loc_ref: reference to source location of parallel region
831 // this_thr: thread data structure corresponding to implicit task
832 // team: team for this_thr
833 // tid: thread id of given thread within team
834 // set_curr_task: TRUE if need to push current task to thread
835 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to have already been done elsewhere.
836 // TODO: Get better loc_ref. Value passed in may be NULL
837 
838 void
839 __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task )
840 {
841  kmp_taskdata_t * task = & team->t.t_implicit_task_taskdata[ tid ];
842 
843  KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
844  tid, team, task, set_curr_task ? "TRUE" : "FALSE" ) );
845 
846  task->td_task_id = KMP_GEN_TASK_ID();
847  task->td_team = team;
848 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info in debugger)
849  task->td_ident = loc_ref;
850  task->td_taskwait_ident = NULL;
851  task->td_taskwait_counter = 0;
852  task->td_taskwait_thread = 0;
853 
854  task->td_flags.tiedness = TASK_TIED;
855  task->td_flags.tasktype = TASK_IMPLICIT;
856 #if OMP_45_ENABLED
857  task->td_flags.proxy = TASK_FULL;
858 #endif
859 
860  // All implicit tasks are executed immediately, not deferred
861  task->td_flags.task_serial = 1;
862  task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
863  task->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
864 
865  task->td_flags.started = 1;
866  task->td_flags.executing = 1;
867  task->td_flags.complete = 0;
868  task->td_flags.freed = 0;
869 
870 #if OMP_40_ENABLED
871  task->td_depnode = NULL;
872 #endif
873 
874  if (set_curr_task) { // only do this initialization the first time a thread is created
875  task->td_incomplete_child_tasks = 0;
876  task->td_allocated_child_tasks = 0; // Not used because do not need to deallocate implicit task
877 #if OMP_40_ENABLED
878  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
879  task->td_dephash = NULL;
880 #endif
881  __kmp_push_current_task_to_thread( this_thr, team, tid );
882  } else {
883  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
884  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
885  }
886 
887 #if OMPT_SUPPORT
888  __kmp_task_init_ompt(task, tid, NULL);
889 #endif
890 
891  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
892  tid, team, task ) );
893 }
894 
895 
896 //-----------------------------------------------------------------------------
897 // __kmp_finish_implicit_task: free resources held by the implicit task at the
898 // end of a parallel region: the dependence hash entries are released, while
899 // the hash itself is kept for reuse by the next parallel region.
900 //
901 // thread: thread data structure corresponding to the implicit task
902 //
903 void
904 __kmp_finish_implicit_task(kmp_info_t *thread)
905 {
906  kmp_taskdata_t *task = thread->th.th_current_task;
907  if (task->td_dephash)
908  __kmp_dephash_free_entries(thread, task->td_dephash);
909 }
910 
911 
912 //-----------------------------------------------------------------------------
913 // __kmp_free_implicit_task: free the remaining resources of the implicit task
914 // (the dependence hash itself) when the thread's implicit task is destroyed.
915 //
916 // thread: thread data structure corresponding to the implicit task
917 //
918 void
919 __kmp_free_implicit_task(kmp_info_t *thread)
920 {
921  kmp_taskdata_t *task = thread->th.th_current_task;
922  if (task->td_dephash)
923  __kmp_dephash_free(thread, task->td_dephash);
924  task->td_dephash = NULL;
925 }
926 
927 
928 // Round up a size to a multiple of val, where val is a power of two
929 // Used to insert padding between structures co-allocated using a single malloc() call
930 static size_t
931 __kmp_round_up_to_val( size_t size, size_t val ) {
932  if ( size & ( val - 1 ) ) {
933  size &= ~ ( val - 1 );
934  if ( size <= KMP_SIZE_T_MAX - val ) {
935  size += val; // Round up if there is no overflow.
936  }; // if
937  }; // if
938  return size;
939 } // __kmp_round_up_to_val
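// For example (a sketch; it assumes a 64-bit build where sizeof(void *) == 8):
//
//     __kmp_round_up_to_val( 13, 8 );   // 13 & 7 != 0   ->  rounds up to 16
//     __kmp_round_up_to_val( 16, 8 );   // already a multiple of 8  ->  16
//
// __kmp_task_alloc below uses this to pad shareds_offset so that the shareds
// block starts on a pointer-aligned boundary.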
940 
941 
942 //---------------------------------------------------------------------------------
943 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
944 //
945 // loc_ref: source location information
946 // gtid: global thread number.
947 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' task encountered.
948 // Converted from kmp_int32 to kmp_tasking_flags_t in routine.
949 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including private vars accessed in task.
950 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed in task.
951 // task_entry: Pointer to task code entry point generated by compiler.
952 // returns: a pointer to the allocated kmp_task_t structure (task).
953 
954 kmp_task_t *
955 __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
956  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
957  kmp_routine_entry_t task_entry )
958 {
959  kmp_task_t *task;
960  kmp_taskdata_t *taskdata;
961  kmp_info_t *thread = __kmp_threads[ gtid ];
962  kmp_team_t *team = thread->th.th_team;
963  kmp_taskdata_t *parent_task = thread->th.th_current_task;
964  size_t shareds_offset;
965 
966  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
967  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
968  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
969  sizeof_shareds, task_entry) );
970 
971  if ( parent_task->td_flags.final ) {
972  if (flags->merged_if0) {
973  }
974  flags->final = 1;
975  }
976 
977 #if OMP_45_ENABLED
978  if ( flags->proxy == TASK_PROXY ) {
979  flags->tiedness = TASK_UNTIED;
980  flags->merged_if0 = 1;
981 
982  /* are we running in a serialized parallel region or under tskm_immediate_exec... either way we need tasking support enabled */
983  if ( (thread->th.th_task_team) == NULL ) {
984  /* This should only happen if the team is serialized;
985  set up a task team and propagate it to the thread
986  */
987  KMP_DEBUG_ASSERT(team->t.t_serialized);
988  KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
989  __kmp_task_team_setup(thread,team,1); // 1 indicates setup the current team regardless of nthreads
990  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
991  }
992  kmp_task_team_t * task_team = thread->th.th_task_team;
993 
994  /* tasking must be enabled now as the task might not be pushed */
995  if ( !KMP_TASKING_ENABLED( task_team ) ) {
996  KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
997  __kmp_enable_tasking( task_team, thread );
998  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
999  kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
1000  // No lock needed since only owner can allocate
1001  if (thread_data -> td.td_deque == NULL ) {
1002  __kmp_alloc_task_deque( thread, thread_data );
1003  }
1004  }
1005 
1006  if ( task_team->tt.tt_found_proxy_tasks == FALSE )
1007  TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
1008  }
1009 #endif
1010 
1011  // Calculate shared structure offset including padding after kmp_task_t struct
1012  // to align pointers in shared struct
1013  shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
1014  shareds_offset = __kmp_round_up_to_val( shareds_offset, sizeof( void * ));
1015 
1016  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1017  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n",
1018  gtid, shareds_offset) );
1019  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n",
1020  gtid, sizeof_shareds) );
1021 
1022  // Avoid double allocation here by combining shareds with taskdata
1023  #if USE_FAST_MEMORY
1024  taskdata = (kmp_taskdata_t *) __kmp_fast_allocate( thread, shareds_offset + sizeof_shareds );
1025  #else /* ! USE_FAST_MEMORY */
1026  taskdata = (kmp_taskdata_t *) __kmp_thread_malloc( thread, shareds_offset + sizeof_shareds );
1027  #endif /* USE_FAST_MEMORY */
1028  ANNOTATE_HAPPENS_AFTER(taskdata);
1029 
1030  task = KMP_TASKDATA_TO_TASK(taskdata);
1031 
1032  // Make sure task & taskdata are aligned appropriately
1033 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1034  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(double)-1) ) == 0 );
1035  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(double)-1) ) == 0 );
1036 #else
1037  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(_Quad)-1) ) == 0 );
1038  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(_Quad)-1) ) == 0 );
1039 #endif
1040  if (sizeof_shareds > 0) {
1041  // Avoid double allocation here by combining shareds with taskdata
1042  task->shareds = & ((char *) taskdata)[ shareds_offset ];
1043  // Make sure shareds struct is aligned to pointer size
1044  KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task->shareds) & (sizeof(void *)-1) ) == 0 );
1045  } else {
1046  task->shareds = NULL;
1047  }
1048  task->routine = task_entry;
1049  task->part_id = 0; // AC: Always start with 0 part id
1050 
1051  taskdata->td_task_id = KMP_GEN_TASK_ID();
1052  taskdata->td_team = team;
1053  taskdata->td_alloc_thread = thread;
1054  taskdata->td_parent = parent_task;
1055  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1056  taskdata->td_untied_count = 0;
1057  taskdata->td_ident = loc_ref;
1058  taskdata->td_taskwait_ident = NULL;
1059  taskdata->td_taskwait_counter = 0;
1060  taskdata->td_taskwait_thread = 0;
1061  KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
1062 #if OMP_45_ENABLED
1063  // avoid copying icvs for proxy tasks
1064  if ( flags->proxy == TASK_FULL )
1065 #endif
1066  copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
1067 
1068  taskdata->td_flags.tiedness = flags->tiedness;
1069  taskdata->td_flags.final = flags->final;
1070  taskdata->td_flags.merged_if0 = flags->merged_if0;
1071 #if OMP_40_ENABLED
1072  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1073 #endif // OMP_40_ENABLED
1074 #if OMP_45_ENABLED
1075  taskdata->td_flags.proxy = flags->proxy;
1076  taskdata->td_task_team = thread->th.th_task_team;
1077  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1078 #endif
1079  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1080 
1081  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1082  taskdata->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
1083 
1084  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1085  taskdata->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
1086 
1087  // GEH - Note we serialize the task if the team is serialized to make sure implicit parallel region
1088  // tasks are not left until program termination to execute. Also, it helps locality to execute
1089  // immediately.
1090  taskdata->td_flags.task_serial = ( parent_task->td_flags.final
1091  || taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser );
1092 
1093  taskdata->td_flags.started = 0;
1094  taskdata->td_flags.executing = 0;
1095  taskdata->td_flags.complete = 0;
1096  taskdata->td_flags.freed = 0;
1097 
1098  taskdata->td_flags.native = flags->native;
1099 
1100  taskdata->td_incomplete_child_tasks = 0;
1101  taskdata->td_allocated_child_tasks = 1; // start at one because counts current task and children
1102 #if OMP_40_ENABLED
1103  taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
1104  taskdata->td_dephash = NULL;
1105  taskdata->td_depnode = NULL;
1106 #endif
1107 
1108  // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
1109 #if OMP_45_ENABLED
1110  if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1111 #else
1112  if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1113 #endif
1114  {
1115  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
1116 #if OMP_40_ENABLED
1117  if ( parent_task->td_taskgroup )
1118  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
1119 #endif
1120  // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
1121  if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ) {
1122  KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
1123  }
1124  }
1125 
1126  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1127  gtid, taskdata, taskdata->td_parent) );
1128  ANNOTATE_HAPPENS_BEFORE(task);
1129 
1130 #if OMPT_SUPPORT
1131  __kmp_task_init_ompt(taskdata, gtid, (void*) task_entry);
1132 #endif
1133 
1134  return task;
1135 }
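//---------------------------------------------------------------------------
// Layout of the single block allocated above for an explicit task (the task
// thunk sits immediately after the taskdata, as the offset arithmetic implies):
//
//     // +---------------------------+  <- taskdata (kmp_taskdata_t *)
//     // | kmp_taskdata_t            |
//     // +---------------------------+  <- task == KMP_TASKDATA_TO_TASK(taskdata)
//     // | kmp_task_t + private vars |     (sizeof_kmp_task_t bytes)
//     // +---------------------------+  <- (char *)taskdata + shareds_offset
//     // | shared-variable pointers  |     (sizeof_shareds bytes; task->shareds,
//     // +---------------------------+      pointer aligned via the round-up)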
1136 
1137 
1138 kmp_task_t *
1139 __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
1140  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1141  kmp_routine_entry_t task_entry )
1142 {
1143  kmp_task_t *retval;
1144  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
1145 
1146  input_flags->native = FALSE;
1147  // __kmp_task_alloc() sets up all other runtime flags
1148 
1149 #if OMP_45_ENABLED
1150  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1151  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1152  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1153  input_flags->proxy ? "proxy" : "",
1154  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1155 #else
1156  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1157  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1158  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1159  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1160 #endif
1161 
1162  retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1163  sizeof_shareds, task_entry );
1164 
1165  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval) );
1166 
1167  return retval;
1168 }
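//---------------------------------------------------------------------------
// For reference, a sketch of how compiler-generated code typically drives this
// entry point (illustrative only: loc, shared_var and my_task_entry are
// hypothetical, the flag value 1 assumes the low bit is the tiedness bit as
// decoded by the trace above, and real compilers append private storage to
// sizeof(kmp_task_t)):
//
//     static kmp_int32 my_task_entry( kmp_int32 gtid, kmp_task_t *task )
//     {
//         void **shareds = (void **) task->shareds;  // captured shared vars
//         /* ... outlined task body ... */
//         return 0;
//     }
//
//     kmp_task_t *t = __kmpc_omp_task_alloc( &loc, gtid, 1 /* tied */,
//                         sizeof(kmp_task_t), sizeof(void *), my_task_entry );
//     ((void **) t->shareds)[0] = &shared_var;  // fill in the shareds block
//     __kmpc_omp_task( &loc, gtid, t );         // defined later in this file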
1169 
1170 //-----------------------------------------------------------
1171 // __kmp_invoke_task: invoke the specified task
1172 //
1173 // gtid: global thread ID of caller
1174 // task: the task to invoke
1175 // current_task: the task to resume after task invocation
1176 
1177 static void
1178 __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
1179 {
1180  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
1181  kmp_uint64 cur_time;
1182 #if OMP_40_ENABLED
1183  int discard = 0 /* false */;
1184 #endif
1185  KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1186  gtid, taskdata, current_task) );
1187  KMP_DEBUG_ASSERT(task);
1188 #if OMP_45_ENABLED
1189  if ( taskdata->td_flags.proxy == TASK_PROXY &&
1190  taskdata->td_flags.complete == 1)
1191  {
1192  // This is a proxy task that was already completed but it needs to run
1193  // its bottom-half finish
1194  KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1195  gtid, taskdata) );
1196 
1197  __kmp_bottom_half_finish_proxy(gtid,task);
1198 
1199  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
1200 
1201  return;
1202  }
1203 #endif
1204 
1205 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1206  if(__kmp_forkjoin_frames_mode == 3) {
1207  // Get the current time stamp to measure task execution time to correct barrier imbalance time
1208  cur_time = __itt_get_timestamp();
1209  }
1210 #endif
1211 
1212 #if OMP_45_ENABLED
1213  // Proxy tasks are not handled by the runtime
1214  if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1215 #endif
1216  ANNOTATE_HAPPENS_AFTER(task);
1217  __kmp_task_start( gtid, task, current_task );
1218 #if OMP_45_ENABLED
1219  }
1220 #endif
1221 
1222 #if OMPT_SUPPORT
1223  ompt_thread_info_t oldInfo;
1224  kmp_info_t * thread;
1225  if (ompt_enabled) {
1226  // Store the thread's state and restore it after the task
1227  thread = __kmp_threads[ gtid ];
1228  oldInfo = thread->th.ompt_thread_info;
1229  thread->th.ompt_thread_info.wait_id = 0;
1230  thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1231  taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
1232  }
1233 #endif
1234 
1235 #if OMP_40_ENABLED
1236  // TODO: cancel tasks if the parallel region has also been cancelled
1237  // TODO: check if this sequence can be hoisted above __kmp_task_start
1238  // if cancellation has been enabled for this run ...
1239  if (__kmp_omp_cancellation) {
1240  kmp_info_t *this_thr = __kmp_threads [ gtid ];
1241  kmp_team_t * this_team = this_thr->th.th_team;
1242  kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1243  if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
1244  KMP_COUNT_BLOCK(TASK_cancelled);
1245  // this task belongs to a task group and we need to cancel it
1246  discard = 1 /* true */;
1247  }
1248  }
1249 
1250  //
1251  // Invoke the task routine and pass in relevant data.
1252  // Thunks generated by gcc take a different argument list.
1253  //
1254  if (!discard) {
1255 #if KMP_STATS_ENABLED
1256  KMP_COUNT_BLOCK(TASK_executed);
1257  switch(KMP_GET_THREAD_STATE()) {
1258  case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break;
1259  case PLAIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break;
1260  case TASKYIELD: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break;
1261  case TASKWAIT: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break;
1262  case TASKGROUP: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break;
1263  default: KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break;
1264  }
1265 #endif // KMP_STATS_ENABLED
1266 #endif // OMP_40_ENABLED
1267 
1268 #if OMPT_SUPPORT && OMPT_TRACE
1269  /* let OMPT know that we're about to run this task */
1270  if (ompt_enabled &&
1271  ompt_callbacks.ompt_callback(ompt_event_task_switch))
1272  {
1273  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1274  current_task->ompt_task_info.task_id,
1275  taskdata->ompt_task_info.task_id);
1276  }
1277 #endif
1278 
1279 #ifdef KMP_GOMP_COMPAT
1280  if (taskdata->td_flags.native) {
1281  ((void (*)(void *))(*(task->routine)))(task->shareds);
1282  }
1283  else
1284 #endif /* KMP_GOMP_COMPAT */
1285  {
1286  (*(task->routine))(gtid, task);
1287  }
1288  KMP_POP_PARTITIONED_TIMER();
1289 
1290 #if OMPT_SUPPORT && OMPT_TRACE
1291  /* let OMPT know that we're returning to the callee task */
1292  if (ompt_enabled &&
1293  ompt_callbacks.ompt_callback(ompt_event_task_switch))
1294  {
1295  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1296  taskdata->ompt_task_info.task_id,
1297  current_task->ompt_task_info.task_id);
1298  }
1299 #endif
1300 
1301 #if OMP_40_ENABLED
1302  }
1303 #endif // OMP_40_ENABLED
1304 
1305 
1306 #if OMPT_SUPPORT
1307  if (ompt_enabled) {
1308  thread->th.ompt_thread_info = oldInfo;
1309  taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1310  }
1311 #endif
1312 
1313 #if OMP_45_ENABLED
1314  // Proxy tasks are not handled by the runtime
1315  if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1316 #endif
1317  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1318  __kmp_task_finish( gtid, task, current_task );
1319 #if OMP_45_ENABLED
1320  }
1321 #endif
1322 
1323 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1324  // Barrier imbalance - correct arrive time after the task finished
1325  if(__kmp_forkjoin_frames_mode == 3) {
1326  kmp_info_t *this_thr = __kmp_threads [ gtid ];
1327  if(this_thr->th.th_bar_arrive_time) {
1328  this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1329  }
1330  }
1331 #endif
1332  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1333  gtid, taskdata, current_task) );
1334  return;
1335 }
1336 
1337 //-----------------------------------------------------------------------
1338 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1339 //
1340 // loc_ref: location of original task pragma (ignored)
1341 // gtid: Global Thread ID of encountering thread
1342 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1343 // Returns:
1344 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1345 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1346 
1347 kmp_int32
1348 __kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1349 {
1350  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1351 
1352  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n",
1353  gtid, loc_ref, new_taskdata ) );
1354 
1355  /* Should we execute the new task or queue it? For now, let's just always try to
1356  queue it. If the queue fills up, then we'll execute it. */
1357 
1358  if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1359  { // Execute this task immediately
1360  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1361  new_taskdata->td_flags.task_serial = 1;
1362  __kmp_invoke_task( gtid, new_task, current_task );
1363  }
1364 
1365  KA_TRACE(10, ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1366  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
1367  new_taskdata ) );
1368 
1369  ANNOTATE_HAPPENS_BEFORE(new_task);
1370  return TASK_CURRENT_NOT_QUEUED;
1371 }
1372 
1373 //---------------------------------------------------------------------
1374 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1375 // gtid: Global Thread ID of encountering thread
1376 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1377 // serialize_immediate: if TRUE, a task that ends up being executed immediately has its execution serialized
1378 // returns:
1379 //
1380 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1381 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1382 kmp_int32
1383 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate )
1384 {
1385  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1386 
1387 #if OMPT_SUPPORT
1388  if (ompt_enabled) {
1389  new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1390  __builtin_frame_address(1);
1391  }
1392 #endif
1393 
1394  /* Should we execute the new task or queue it? For now, let's just always try to
1395  queue it. If the queue fills up, then we'll execute it. */
1396 #if OMP_45_ENABLED
1397  if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1398 #else
1399  if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1400 #endif
1401  { // Execute this task immediately
1402  kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1403  if ( serialize_immediate )
1404  new_taskdata -> td_flags.task_serial = 1;
1405  __kmp_invoke_task( gtid, new_task, current_task );
1406  }
1407 
1408 #if OMPT_SUPPORT
1409  if (ompt_enabled) {
1410  new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1411  }
1412 #endif
1413 
1414  ANNOTATE_HAPPENS_BEFORE(new_task);
1415  return TASK_CURRENT_NOT_QUEUED;
1416 }
1417 
1418 //---------------------------------------------------------------------
1419 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from
1420 // the parent thread only!
1421 // loc_ref: location of original task pragma (ignored)
1422 // gtid: Global Thread ID of encountering thread
1423 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1424 // returns:
1425 //
1426 // TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued for later resumption.
1427 // TASK_CURRENT_QUEUED (1) if the current task was suspended and queued for later resumption.
1428 
1429 kmp_int32
1430 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1431 {
1432  kmp_int32 res;
1433  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1434 
1435 #if KMP_DEBUG
1436  kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1437 #endif
1438  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n",
1439  gtid, loc_ref, new_taskdata ) );
1440 
1441  res = __kmp_omp_task(gtid,new_task,true);
1442 
1443  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1444  gtid, loc_ref, new_taskdata ) );
1445  return res;
1446 }
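
// Illustrative sketch (not part of the runtime itself): a compiler typically lowers
//
//   #pragma omp task
//   { work(); }
//
// into a call sequence along the following lines. The allocator entry point
// __kmpc_omp_task_alloc() and the flag value are assumptions about the usual
// compiler/runtime contract; the exact sizes and flags vary by compiler.
//
//   kmp_int32    gtid = __kmpc_global_thread_num( &loc );
//   kmp_task_t * task = __kmpc_omp_task_alloc( &loc, gtid, /* flags: tied (assumed) */ 1,
//                           sizeof(kmp_task_t) /* + privates */, /* sizeof_shareds */ 0,
//                           (kmp_routine_entry_t) &task_entry );  // task_entry() invokes work()
//   __kmpc_omp_task( &loc, gtid, task );   // defer the task, or run it now if it cannot be queued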
1447 
1448 //-------------------------------------------------------------------------------------
1449 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete
1450 
1451 kmp_int32
1452 __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
1453 {
1454  kmp_taskdata_t * taskdata;
1455  kmp_info_t * thread;
1456  int thread_finished = FALSE;
1457  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1458 
1459  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
1460 
1461  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1462  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1463 
1464  thread = __kmp_threads[ gtid ];
1465  taskdata = thread -> th.th_current_task;
1466 
1467 #if OMPT_SUPPORT && OMPT_TRACE
1468  ompt_task_id_t my_task_id;
1469  ompt_parallel_id_t my_parallel_id;
1470 
1471  if (ompt_enabled) {
1472  kmp_team_t *team = thread->th.th_team;
1473  my_task_id = taskdata->ompt_task_info.task_id;
1474  my_parallel_id = team->t.ompt_team_info.parallel_id;
1475 
1476  taskdata->ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
1477  if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1478  ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(
1479  my_parallel_id, my_task_id);
1480  }
1481  }
1482 #endif
1483 
1484  // Debugger: The taskwait is active. Store the location and the thread that encountered the taskwait.
1485 #if USE_ITT_BUILD
1486  // Note: These values are used by ITT events as well.
1487 #endif /* USE_ITT_BUILD */
1488  taskdata->td_taskwait_counter += 1;
1489  taskdata->td_taskwait_ident = loc_ref;
1490  taskdata->td_taskwait_thread = gtid + 1;
1491 
1492 #if USE_ITT_BUILD
1493  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1494  if ( itt_sync_obj != NULL )
1495  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1496 #endif /* USE_ITT_BUILD */
1497 
1498  bool must_wait = ! taskdata->td_flags.team_serial && ! taskdata->td_flags.final;
1499 
1500 #if OMP_45_ENABLED
1501  must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks);
1502 #endif
1503  if (must_wait)
1504  {
1505  kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
1506  while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
1507  flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1508  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1509  }
1510  }
1511 #if USE_ITT_BUILD
1512  if ( itt_sync_obj != NULL )
1513  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1514 #endif /* USE_ITT_BUILD */
1515 
1516  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1517  // Debugger: The taskwait is completed. Location remains, but thread is negated.
1518  taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1519 
1520 #if OMPT_SUPPORT && OMPT_TRACE
1521  if (ompt_enabled) {
1522  if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1523  ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(
1524  my_parallel_id, my_task_id);
1525  }
1526  taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1527  }
1528 #endif
1529  ANNOTATE_HAPPENS_AFTER(taskdata);
1530  }
1531 
1532  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1533  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1534 
1535  return TASK_CURRENT_NOT_QUEUED;
1536 }
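
// Illustrative sketch (not part of the runtime itself): a "#pragma omp taskwait" in user
// code is typically lowered to a single call to the entry point above, roughly:
//
//   __kmpc_omp_taskwait( &loc, __kmpc_global_thread_num( &loc ) );
//
// The encountering task resumes once its td_incomplete_child_tasks counter reaches zero.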
1537 
1538 
1539 //-------------------------------------------------
1540 // __kmpc_omp_taskyield: switch to a different task
1541 
1542 kmp_int32
1543 __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
1544 {
1545  kmp_taskdata_t * taskdata;
1546  kmp_info_t * thread;
1547  int thread_finished = FALSE;
1548 
1549  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1550  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1551 
1552  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1553  gtid, loc_ref, end_part) );
1554 
1555  if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) {
1556  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1557 
1558  thread = __kmp_threads[ gtid ];
1559  taskdata = thread -> th.th_current_task;
1560  // Should we model this as a task wait or not?
1561  // Debugger: The taskwait is active. Store the location and the thread that encountered the taskwait.
1562 #if USE_ITT_BUILD
1563  // Note: These values are used by ITT events as well.
1564 #endif /* USE_ITT_BUILD */
1565  taskdata->td_taskwait_counter += 1;
1566  taskdata->td_taskwait_ident = loc_ref;
1567  taskdata->td_taskwait_thread = gtid + 1;
1568 
1569 #if USE_ITT_BUILD
1570  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1571  if ( itt_sync_obj != NULL )
1572  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1573 #endif /* USE_ITT_BUILD */
1574  if ( ! taskdata->td_flags.team_serial ) {
1575  kmp_task_team_t * task_team = thread->th.th_task_team;
1576  if (task_team != NULL) {
1577  if (KMP_TASKING_ENABLED(task_team)) {
1578  __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
1579  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1580  }
1581  }
1582  }
1583 #if USE_ITT_BUILD
1584  if ( itt_sync_obj != NULL )
1585  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1586 #endif /* USE_ITT_BUILD */
1587 
1588  // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1589  // Debugger: The taskwait is completed. Location remains, but thread is negated.
1590  taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1591  }
1592 
1593  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1594  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1595 
1596  return TASK_CURRENT_NOT_QUEUED;
1597 }
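
// Illustrative sketch (not part of the runtime itself): "#pragma omp taskyield" is
// typically lowered to a call such as the following; passing 0 for end_part is an
// assumption about the usual compiler contract:
//
//   __kmpc_omp_taskyield( &loc, __kmpc_global_thread_num( &loc ), /* end_part */ 0 );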
1598 
1599 
1600 #if OMP_40_ENABLED
1601 //-------------------------------------------------------------------------------------
1602 // __kmpc_taskgroup: Start a new taskgroup
1603 
1604 void
1605 __kmpc_taskgroup( ident_t* loc, int gtid )
1606 {
1607  kmp_info_t * thread = __kmp_threads[ gtid ];
1608  kmp_taskdata_t * taskdata = thread->th.th_current_task;
1609  kmp_taskgroup_t * tg_new =
1610  (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
1611  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
1612  tg_new->count = 0;
1613  tg_new->cancel_request = cancel_noreq;
1614  tg_new->parent = taskdata->td_taskgroup;
1615  taskdata->td_taskgroup = tg_new;
1616 }
1617 
1618 
1619 //-------------------------------------------------------------------------------------
1620 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1621 // and its descendants are complete
1622 
1623 void
1624 __kmpc_end_taskgroup( ident_t* loc, int gtid )
1625 {
1626  kmp_info_t * thread = __kmp_threads[ gtid ];
1627  kmp_taskdata_t * taskdata = thread->th.th_current_task;
1628  kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1629  int thread_finished = FALSE;
1630 
1631  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
1632  KMP_DEBUG_ASSERT( taskgroup != NULL );
1633  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1634 
1635  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1636 #if USE_ITT_BUILD
1637  // For ITT the taskgroup wait is similar to taskwait until we need to distinguish them
1638  void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1639  if ( itt_sync_obj != NULL )
1640  __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1641 #endif /* USE_ITT_BUILD */
1642 
1643 #if OMP_45_ENABLED
1644  if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
1645 #else
1646  if ( ! taskdata->td_flags.team_serial )
1647 #endif
1648  {
1649  kmp_flag_32 flag(&(taskgroup->count), 0U);
1650  while ( TCR_4(taskgroup->count) != 0 ) {
1651  flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1652  USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1653  }
1654  }
1655 
1656 #if USE_ITT_BUILD
1657  if ( itt_sync_obj != NULL )
1658  __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1659 #endif /* USE_ITT_BUILD */
1660  }
1661  KMP_DEBUG_ASSERT( taskgroup->count == 0 );
1662 
1663  // Restore parent taskgroup for the current task
1664  taskdata->td_taskgroup = taskgroup->parent;
1665  __kmp_thread_free( thread, taskgroup );
1666 
1667  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", gtid, taskdata) );
1668  ANNOTATE_HAPPENS_AFTER(taskdata);
1669 }
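
// Illustrative sketch (not part of the runtime itself): a taskgroup region such as
//
//   #pragma omp taskgroup
//   { /* task-generating code */ }
//
// is typically bracketed by the two entry points above, roughly:
//
//   __kmpc_taskgroup( &loc, gtid );        // push a new kmp_taskgroup_t with count == 0
//   /* task-generating code */
//   __kmpc_end_taskgroup( &loc, gtid );    // execute/steal tasks until taskgroup->count == 0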
1670 #endif
1671 
1672 
1673 //------------------------------------------------------
1674 // __kmp_remove_my_task: remove a task from my own deque
1675 
1676 static kmp_task_t *
1677 __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task_team,
1678  kmp_int32 is_constrained )
1679 {
1680  kmp_task_t * task;
1681  kmp_taskdata_t * taskdata;
1682  kmp_thread_data_t *thread_data;
1683  kmp_uint32 tail;
1684 
1685  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1686  KMP_DEBUG_ASSERT( task_team -> tt.tt_threads_data != NULL ); // Caller should check this condition
1687 
1688  thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
1689 
1690  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1691  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1692  thread_data->td.td_deque_tail) );
1693 
1694  if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1695  KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1696  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1697  thread_data->td.td_deque_tail) );
1698  return NULL;
1699  }
1700 
1701  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
1702 
1703  if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1704  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1705  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1706  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1707  thread_data->td.td_deque_tail) );
1708  return NULL;
1709  }
1710 
1711  tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td); // Wrap index.
1712  taskdata = thread_data -> td.td_deque[ tail ];
1713 
1714  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1715  // we need to check if the candidate obeys task scheduling constraint:
1716  // only child of current task can be scheduled
1717  kmp_taskdata_t * current = thread->th.th_current_task;
1718  kmp_int32 level = current->td_level;
1719  kmp_taskdata_t * parent = taskdata->td_parent;
1720  while ( parent != current && parent->td_level > level ) {
1721  parent = parent->td_parent; // check generation up to the level of the current task
1722  KMP_DEBUG_ASSERT(parent != NULL);
1723  }
1724  if ( parent != current ) {
1725  // If the tail task is not a child, then no other child can appear in the deque.
1726  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1727  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1728  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1729  thread_data->td.td_deque_tail) );
1730  return NULL;
1731  }
1732  }
1733 
1734  thread_data -> td.td_deque_tail = tail;
1735  TCW_4(thread_data -> td.td_deque_ntasks, thread_data -> td.td_deque_ntasks - 1);
1736 
1737  __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
1738 
1739  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
1740  gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1741  thread_data->td.td_deque_tail) );
1742 
1743  task = KMP_TASKDATA_TO_TASK( taskdata );
1744  return task;
1745 }
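
// Worked example of the tail arithmetic above (illustrative only, assuming a deque of
// size 8 so that TASK_DEQUE_MASK(td) == 7): with td_deque_tail == 0,
//
//   tail = ( 0 - 1 ) & 7;   // == 7: the unsigned index wraps to the last slot
//
// so the owning thread pops the task it pushed most recently (LIFO at the tail), while
// the head is left untouched for concurrent thieves.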
1746 
1747 
1748 //-----------------------------------------------------------
1749 // __kmp_steal_task: remove a task from another thread's deque
1750 // Assume that calling thread has already checked existence of
1751 // task_team thread_data before calling this routine.
1752 
1753 static kmp_task_t *
1754 __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
1755  volatile kmp_uint32 *unfinished_threads, int *thread_finished,
1756  kmp_int32 is_constrained )
1757 {
1758  kmp_task_t * task;
1759  kmp_taskdata_t * taskdata;
1760  kmp_thread_data_t *victim_td, *threads_data;
1761  kmp_int32 victim_tid;
1762 
1763  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1764 
1765  threads_data = task_team -> tt.tt_threads_data;
1766  KMP_DEBUG_ASSERT( threads_data != NULL ); // Caller should check this condition
1767 
1768  victim_tid = victim->th.th_info.ds.ds_tid;
1769  victim_td = & threads_data[ victim_tid ];
1770 
1771  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
1772  "head=%u tail=%u\n",
1773  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1774  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1775 
1776  if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) || // Caller should not check this condition
1777  (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1778  {
1779  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: task_team=%p "
1780  "ntasks=%d head=%u tail=%u\n",
1781  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1782  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1783  return NULL;
1784  }
1785 
1786  __kmp_acquire_bootstrap_lock( & victim_td -> td.td_deque_lock );
1787 
1788  // Check again after we acquire the lock
1789  if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) ||
1790  (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1791  {
1792  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1793  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1794  "ntasks=%d head=%u tail=%u\n",
1795  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1796  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1797  return NULL;
1798  }
1799 
1800  KMP_DEBUG_ASSERT( victim_td -> td.td_deque != NULL );
1801 
1802  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
1803  if ( is_constrained ) {
1804  // we need to check if the candidate obeys task scheduling constraint:
1805  // only descendant of current task can be scheduled
1806  kmp_taskdata_t * current = __kmp_threads[ gtid ]->th.th_current_task;
1807  kmp_int32 level = current->td_level;
1808  kmp_taskdata_t * parent = taskdata->td_parent;
1809  while ( parent != current && parent->td_level > level ) {
1810  parent = parent->td_parent; // check generation up to the level of the current task
1811  KMP_DEBUG_ASSERT(parent != NULL);
1812  }
1813  if ( parent != current ) {
1814  // If the head task is not a descendant of the current task then do not
1815  // steal it. No other task in victim's deque can be a descendant of the
1816  // current task.
1817  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1818  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1819  "ntasks=%d head=%u tail=%u\n",
1820  gtid, __kmp_gtid_from_thread( threads_data[victim_tid].td.td_thr ),
1821  task_team, victim_td->td.td_deque_ntasks,
1822  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1823  return NULL;
1824  }
1825  }
1826  // Bump head pointer and Wrap.
1827  victim_td->td.td_deque_head = (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
1828  if (*thread_finished) {
1829  // We need to un-mark this victim as a finished victim. This must be done before
1830  // releasing the lock, or else other threads (starting with the master victim)
1831  // might be prematurely released from the barrier!!!
1832  kmp_uint32 count;
1833 
1834  count = KMP_TEST_THEN_INC32( (kmp_int32 *)unfinished_threads );
1835 
1836  KA_TRACE(20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
1837  gtid, count + 1, task_team) );
1838 
1839  *thread_finished = FALSE;
1840  }
1841  TCW_4(victim_td -> td.td_deque_ntasks, TCR_4(victim_td -> td.td_deque_ntasks) - 1);
1842 
1843  __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1844 
1845  KMP_COUNT_BLOCK(TASK_stolen);
1846  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
1847  "ntasks=%d head=%u tail=%u\n",
1848  gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
1849  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1850  victim_td->td.td_deque_tail) );
1851 
1852  task = KMP_TASKDATA_TO_TASK( taskdata );
1853  return task;
1854 }
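
// Note on the deque discipline (illustrative summary of the two routines above): the
// owning thread pushes and pops at td_deque_tail, while thieves advance td_deque_head,
//
//   victim_td->td.td_deque_head = ( victim_td->td.td_deque_head + 1 ) & TASK_DEQUE_MASK(victim_td->td);
//
// so steals proceed in FIFO order (oldest task first) and touch the opposite end of the
// circular buffer from the owner's LIFO accesses.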
1855 
1856 
1857 //-----------------------------------------------------------------------------
1858 // __kmp_execute_tasks_template: Choose and execute tasks until either the condition
1859 // is satisfied (return true) or there are none left (return false).
1860 // final_spin is TRUE if this is the spin at the release barrier.
1861 // thread_finished indicates whether the thread is finished executing all
1862 // the tasks it has on its deque, and is at the release barrier.
1863 // spinner is the location on which to spin.
1864 // spinner == NULL means only execute a single task and return.
1865 // checker is the value to check to terminate the spin.
1866 template <class C>
1867 static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
1868  int *thread_finished
1869  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
1870 {
1871  kmp_task_team_t * task_team = thread->th.th_task_team;
1872  kmp_thread_data_t * threads_data;
1873  kmp_task_t * task;
1874  kmp_info_t * other_thread;
1875  kmp_taskdata_t * current_task = thread -> th.th_current_task;
1876  volatile kmp_uint32 * unfinished_threads;
1877  kmp_int32 nthreads, victim=-2, use_own_tasks=1, new_victim=0, tid=thread->th.th_info.ds.ds_tid;
1878 
1879  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1880  KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
1881 
1882  if (task_team == NULL) return FALSE;
1883 
1884  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
1885  gtid, final_spin, *thread_finished) );
1886 
1887  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
1888  KMP_DEBUG_ASSERT( threads_data != NULL );
1889 
1890  nthreads = task_team -> tt.tt_nproc;
1891  unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
1892 #if OMP_45_ENABLED
1893  KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
1894 #else
1895  KMP_DEBUG_ASSERT( nthreads > 1 );
1896 #endif
1897  KMP_DEBUG_ASSERT( (int)(TCR_4(*unfinished_threads)) >= 0 );
1898 
1899  while (1) { // Outer loop keeps trying to find tasks in case of single thread getting tasks from target constructs
1900  while (1) { // Inner loop to find a task and execute it
1901  task = NULL;
1902  if (use_own_tasks) { // check on own queue first
1903  task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained );
1904  }
1905  if ((task == NULL) && (nthreads > 1)) { // Steal a task
1906  int asleep = 1;
1907  use_own_tasks = 0;
1908  // Try to steal from the last place I stole from successfully.
1909  if (victim == -2) { // haven't stolen anything yet
1910  victim = threads_data[tid].td.td_deque_last_stolen;
1911  if (victim != -1) // if we have a last stolen from victim, get the thread
1912  other_thread = threads_data[victim].td.td_thr;
1913  }
1914  if (victim != -1) { // found last victim
1915  asleep = 0;
1916  }
1917  else if (!new_victim) { // no recent steals and we haven't already used a new victim; select a random thread
1918  do { // Find a different thread to steal work from.
1919  // Pick a random thread. Initial plan was to cycle through all the threads, and only return if
1920  // we tried to steal from every thread, and failed. Arch says that's not such a great idea.
1921  victim = __kmp_get_random(thread) % (nthreads - 1);
1922  if (victim >= tid) {
1923  ++victim; // Adjusts random distribution to exclude self
1924  }
1925  // Found a potential victim
1926  other_thread = threads_data[victim].td.td_thr;
1927  // There is a slight chance that __kmp_enable_tasking() did not wake up all threads
1928  // waiting at the barrier. If victim is sleeping, then wake it up. Since we were going to
1929  // pay the cache miss penalty for referencing another thread's kmp_info_t struct anyway,
1930  // the check shouldn't cost too much performance at this point. In extra barrier mode, tasks
1931  // do not sleep at the separate tasking barrier, so this isn't a problem.
1932  asleep = 0;
1933  if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
1934  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
1935  (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
1936  asleep = 1;
1937  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
1938  // A sleeping thread should not have any tasks on its queue. There is a slight
1939  // possibility that it resumes, steals a task from another thread, which spawns more
1940  // tasks, all in the time that it takes this thread to check => don't write an assertion
1941  // that the victim's queue is empty. Try stealing from a different thread.
1942  }
1943  } while (asleep);
1944  }
1945 
1946  if (!asleep) {
1947  // We have a victim to try to steal from
1948  task = __kmp_steal_task(other_thread, gtid, task_team, unfinished_threads, thread_finished, is_constrained);
1949  }
1950  if (task != NULL) { // set last stolen to victim
1951  if (threads_data[tid].td.td_deque_last_stolen != victim) {
1952  threads_data[tid].td.td_deque_last_stolen = victim;
1953  // The pre-refactored code did not try more than 1 successful new victim,
1954  // unless the last one generated more local tasks; new_victim keeps track of this
1955  new_victim = 1;
1956  }
1957  }
1958  else { // No tasks found; unset last_stolen
1959  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
1960  victim = -2; // no successful victim found
1961  }
1962  }
1963 
1964  if (task == NULL) // break out of tasking loop
1965  break;
1966 
1967  // Found a task; execute it
1968 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1969  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
1970  if ( itt_sync_obj == NULL ) { // we are at fork barrier where we could not get the object reliably
1971  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1972  }
1973  __kmp_itt_task_starting( itt_sync_obj );
1974  }
1975 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1976  __kmp_invoke_task( gtid, task, current_task );
1977 #if USE_ITT_BUILD
1978  if ( itt_sync_obj != NULL ) __kmp_itt_task_finished( itt_sync_obj );
1979 #endif /* USE_ITT_BUILD */
1980  // If this thread is only partway through the barrier and the condition is met, then return now,
1981  // so that the barrier gather/release pattern can proceed. If this thread is in the last spin loop
1982  // in the barrier, waiting to be released, we know that the termination condition will not be
1983  // satisfied, so don't waste any cycles checking it.
1984  if (flag == NULL || (!final_spin && flag->done_check())) {
1985  KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
1986  return TRUE;
1987  }
1988  if (thread->th.th_task_team == NULL) {
1989  break;
1990  }
1991  KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
1992  // If execution of a stolen task results in more tasks being placed on our run queue, reset use_own_tasks
1993  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
1994  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid));
1995  use_own_tasks = 1;
1996  new_victim = 0;
1997  }
1998  }
1999 
2000  // The task source has been exhausted. If in final spin loop of barrier, check if termination condition is satisfied.
2001 #if OMP_45_ENABLED
2002  // The work queue may be empty but there might be proxy tasks still executing
2003  if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2004 #else
2005  if (final_spin)
2006 #endif
2007  {
2008  // First, decrement the #unfinished threads, if that has not already been done. This decrement
2009  // might be to the spin location, and result in the termination condition being satisfied.
2010  if (! *thread_finished) {
2011  kmp_uint32 count;
2012 
2013  count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
2014  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec unfinished_threads to %d task_team=%p\n",
2015  gtid, count, task_team) );
2016  *thread_finished = TRUE;
2017  }
2018 
2019  // It is now unsafe to reference thread->th.th_team !!!
2020  // Decrementing task_team->tt.tt_unfinished_threads can allow the master thread to pass through
2021  // the barrier, where it might reset each thread's th.th_team field for the next parallel region.
2022  // If we can steal more work, we know that this has not happened yet.
2023  if (flag != NULL && flag->done_check()) {
2024  KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
2025  return TRUE;
2026  }
2027  }
2028 
2029  // If this thread's task team is NULL, master has recognized that there are no more tasks; bail out
2030  if (thread->th.th_task_team == NULL) {
2031  KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid) );
2032  return FALSE;
2033  }
2034 
2035 #if OMP_45_ENABLED
2036  // We could be getting tasks from target constructs; if this is the only thread, keep trying to execute
2037  // tasks from own queue
2038  if (nthreads == 1)
2039  use_own_tasks = 1;
2040  else
2041 #endif
2042  {
2043  KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid) );
2044  return FALSE;
2045  }
2046  }
2047 }
2048 
2049 int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2050  int *thread_finished
2051  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2052 {
2053  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2054  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2055 }
2056 
2057 int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2058  int *thread_finished
2059  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2060 {
2061  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2062  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2063 }
2064 
2065 int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2066  int *thread_finished
2067  USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2068 {
2069  return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2070  USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2071 }
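
// Usage note (illustrative, based on the callers above): passing flag == NULL makes the
// template execute at most one task and return, which is how __kmpc_omp_taskyield() drives
// the 32-bit wrapper:
//
//   __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
//                           USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
//
// whereas the taskwait and taskgroup paths pass a kmp_flag_32 spinning on a child-task or
// taskgroup counter and keep executing tasks until flag->done_check() succeeds.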
2072 
2073 
2074 
2075 //-----------------------------------------------------------------------------
2076 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2077 // next barrier so they can assist in executing enqueued tasks.
2078 // First thread in allocates the task team atomically.
2079 
2080 static void
2081 __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
2082 {
2083  kmp_thread_data_t *threads_data;
2084  int nthreads, i, is_init_thread;
2085 
2086  KA_TRACE( 10, ( "__kmp_enable_tasking(enter): T#%d\n",
2087  __kmp_gtid_from_thread( this_thr ) ) );
2088 
2089  KMP_DEBUG_ASSERT(task_team != NULL);
2090  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2091 
2092  nthreads = task_team->tt.tt_nproc;
2093  KMP_DEBUG_ASSERT(nthreads > 0);
2094  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2095 
2096  // Allocate or increase the size of threads_data if necessary
2097  is_init_thread = __kmp_realloc_task_threads_data( this_thr, task_team );
2098 
2099  if (!is_init_thread) {
2100  // Some other thread already set up the array.
2101  KA_TRACE( 20, ( "__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2102  __kmp_gtid_from_thread( this_thr ) ) );
2103  return;
2104  }
2105  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
2106  KMP_DEBUG_ASSERT( threads_data != NULL );
2107 
2108  if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
2109  ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) )
2110  {
2111  // Release any threads sleeping at the barrier, so that they can steal
2112  // tasks and execute them. In extra barrier mode, tasks do not sleep
2113  // at the separate tasking barrier, so this isn't a problem.
2114  for (i = 0; i < nthreads; i++) {
2115  volatile void *sleep_loc;
2116  kmp_info_t *thread = threads_data[i].td.td_thr;
2117 
2118  if (i == this_thr->th.th_info.ds.ds_tid) {
2119  continue;
2120  }
2121  // Since we haven't locked the thread's suspend mutex at this
2122  // point, there is a small window where a thread might be putting
2123  // itself to sleep, but hasn't set the th_sleep_loc field yet.
2124  // To work around this, __kmp_execute_tasks_template() periodically checks to
2125  // see if other threads are sleeping (using the same random
2126  // mechanism that is used for task stealing) and awakens them if
2127  // they are.
2128  if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL )
2129  {
2130  KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2131  __kmp_gtid_from_thread( this_thr ),
2132  __kmp_gtid_from_thread( thread ) ) );
2133  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2134  }
2135  else {
2136  KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2137  __kmp_gtid_from_thread( this_thr ),
2138  __kmp_gtid_from_thread( thread ) ) );
2139  }
2140  }
2141  }
2142 
2143  KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n",
2144  __kmp_gtid_from_thread( this_thr ) ) );
2145 }
2146 
2147 
2148 /* ------------------------------------------------------------------------ */
2149 /* // TODO: Check the comment consistency
2150  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
2151  * like a shadow of the kmp_team_t data struct, with a different lifetime.
2152  * After a child thread checks into a barrier and calls __kmp_release() from
2153  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2154  * longer assume that the kmp_team_t structure is intact (at any moment, the
2155  * master thread may exit the barrier code and free the team data structure,
2156  * and return the threads to the thread pool).
2157  *
2158  * This does not work with the tasking code, as the thread is still
2159  * expected to participate in the execution of any tasks that may have been
2160  * spawned by a member of the team, and the thread still needs access to
2161  * each of the other threads in the team, so that it can steal work from them.
2162  *
2163  * Enter the existence of the kmp_task_team_t struct. It employs a reference
2164  * counting mechanism, and is allocated by the master thread before calling
2165  * __kmp_<barrier_kind>_release, and then is released by the last thread to
2166  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
2167  * of the kmp_task_team_t structs for consecutive barriers can overlap
2168  * (and will, unless the master thread is the last thread to exit the barrier
2169  * release phase, which is not typical).
2170  *
2171  * The existence of such a struct is useful outside the context of tasking,
2172  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2173  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2174  * libraries.
2175  *
2176  * We currently use the existence of the threads array as an indicator that
2177  * tasks were spawned since the last barrier. If the structure is to be
2178  * useful outside the context of tasking, then this will have to change, but
2179  * not setting the field minimizes the performance impact of tasking on
2180  * barriers, when no explicit tasks were spawned (pushed, actually).
2181  */
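
// Illustrative lifecycle sketch (summarizing the comment above; the real call sites live
// in the barrier code, so the ordering below is approximate):
//
//   __kmp_task_team_setup( this_thr, team, always );        // master: allocate/recycle task teams
//   // ... barrier release phase ...
//   __kmp_task_team_sync( this_thr, team );                  // each thread: pick up the new task team
//   // ... parallel region: __kmp_push_task / __kmp_steal_task / __kmp_execute_tasks_* ...
//   __kmp_task_team_wait( this_thr, team
//                         USE_ITT_BUILD_ARG(itt_sync_obj), 1 );  // master: drain tasks and deactivate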
2182 
2183 
2184 static kmp_task_team_t *__kmp_free_task_teams = NULL; // Free list for task_team data structures
2185 // Lock for task team data structures
2186 static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock );
2187 
2188 
2189 //------------------------------------------------------------------------------
2190 // __kmp_alloc_task_deque:
2191 // Allocates a task deque for a particular thread, and initializes the necessary
2192 // data structures relating to the deque. This only happens once per thread
2193 // per task team since task teams are recycled.
2194 // No lock is needed during allocation since each thread allocates its own
2195 // deque.
2196 
2197 static void
2198 __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2199 {
2200  __kmp_init_bootstrap_lock( & thread_data -> td.td_deque_lock );
2201  KMP_DEBUG_ASSERT( thread_data -> td.td_deque == NULL );
2202 
2203  // Initialize last stolen task field to "none"
2204  thread_data -> td.td_deque_last_stolen = -1;
2205 
2206  KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) == 0 );
2207  KMP_DEBUG_ASSERT( thread_data -> td.td_deque_head == 0 );
2208  KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
2209 
2210  KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2211  __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
2212  // Allocate space for task deque, and zero the deque
2213  // Cannot use __kmp_thread_calloc() because threads not around for
2214  // kmp_reap_task_team( ).
2215  thread_data -> td.td_deque = (kmp_taskdata_t **)
2216  __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2217  thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2218 }
2219 
2220 //------------------------------------------------------------------------------
2221 // __kmp_realloc_task_deque:
2222 // Re-allocates a task deque for a particular thread, copies the content from the old deque
2223 // and adjusts the necessary data structures relating to the deque.
2224 // This operation must be done with the deque_lock held.
2225 
2226 static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2227 {
2228  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2229  kmp_int32 new_size = 2 * size;
2230 
2231  KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
2232  __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
2233 
2234  kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
2235 
2236  int i,j;
2237  for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
2238  new_deque[j] = thread_data->td.td_deque[i];
2239 
2240  __kmp_free(thread_data->td.td_deque);
2241 
2242  thread_data -> td.td_deque_head = 0;
2243  thread_data -> td.td_deque_tail = size;
2244  thread_data -> td.td_deque = new_deque;
2245  thread_data -> td.td_deque_size = new_size;
2246 }
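
// Worked example for the copy loop above (illustrative only, assuming the deque is full
// when it is resized): with size == 8 and td_deque_head == 5, the loop visits indices
// 5,6,7,0,1,2,3,4 and writes them to new_deque[0..7], so afterwards
//
//   td_deque_head == 0, td_deque_tail == 8 (the old size), td_deque_size == 16
//
// and the slot at the new tail is the first free entry of the doubled buffer.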
2247 
2248 //------------------------------------------------------------------------------
2249 // __kmp_free_task_deque:
2250 // Deallocates a task deque for a particular thread.
2251 // Happens at library deallocation so don't need to reset all thread data fields.
2252 
2253 static void
2254 __kmp_free_task_deque( kmp_thread_data_t *thread_data )
2255 {
2256  __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
2257 
2258  if ( thread_data -> td.td_deque != NULL ) {
2259  TCW_4(thread_data -> td.td_deque_ntasks, 0);
2260  __kmp_free( thread_data -> td.td_deque );
2261  thread_data -> td.td_deque = NULL;
2262  }
2263  __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
2264 
2265 #ifdef BUILD_TIED_TASK_STACK
2266  // GEH: Figure out what to do here for td_susp_tied_tasks
2267  if ( thread_data -> td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY ) {
2268  __kmp_free_task_stack( __kmp_thread_from_gtid( gtid ), thread_data );
2269  }
2270 #endif // BUILD_TIED_TASK_STACK
2271 }
2272 
2273 
2274 //------------------------------------------------------------------------------
2275 // __kmp_realloc_task_threads_data:
2276 // Allocates a threads_data array for a task team, either by allocating an initial
2277 // array or enlarging an existing array. Only the first thread to get the lock
2278 // allocs or enlarges the array and re-initializes the array elements.
2279 // That thread returns "TRUE", the rest return "FALSE".
2280 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2281 // The current size is given by task_team -> tt.tt_max_threads.
2282 
2283 static int
2284 __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team )
2285 {
2286  kmp_thread_data_t ** threads_data_p;
2287  kmp_int32 nthreads, maxthreads;
2288  int is_init_thread = FALSE;
2289 
2290  if ( TCR_4(task_team -> tt.tt_found_tasks) ) {
2291  // Already reallocated and initialized.
2292  return FALSE;
2293  }
2294 
2295  threads_data_p = & task_team -> tt.tt_threads_data;
2296  nthreads = task_team -> tt.tt_nproc;
2297  maxthreads = task_team -> tt.tt_max_threads;
2298 
2299  // All threads must lock when they encounter the first task of the implicit task
2300  // region to make sure threads_data fields are (re)initialized before used.
2301  __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2302 
2303  if ( ! TCR_4(task_team -> tt.tt_found_tasks) ) {
2304  // first thread to enable tasking
2305  kmp_team_t *team = thread -> th.th_team;
2306  int i;
2307 
2308  is_init_thread = TRUE;
2309  if ( maxthreads < nthreads ) {
2310 
2311  if ( *threads_data_p != NULL ) {
2312  kmp_thread_data_t *old_data = *threads_data_p;
2313  kmp_thread_data_t *new_data = NULL;
2314 
2315  KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d reallocating "
2316  "threads data for task_team %p, new_size = %d, old_size = %d\n",
2317  __kmp_gtid_from_thread( thread ), task_team,
2318  nthreads, maxthreads ) );
2319  // Reallocate threads_data to have more elements than current array
2320  // Cannot use __kmp_thread_realloc() because threads not around for
2321  // kmp_reap_task_team( ). Note all new array entries are initialized
2322  // to zero by __kmp_allocate().
2323  new_data = (kmp_thread_data_t *)
2324  __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2325  // copy old data to new data
2326  KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
2327  (void *) old_data,
2328  maxthreads * sizeof(kmp_thread_data_t) ); // copy full old entries, not just pointers
2329 
2330 #ifdef BUILD_TIED_TASK_STACK
2331  // GEH: Figure out if this is the right thing to do
2332  for (i = maxthreads; i < nthreads; i++) {
2333  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2334  __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2335  }
2336 #endif // BUILD_TIED_TASK_STACK
2337  // Install the new data and free the old data
2338  (*threads_data_p) = new_data;
2339  __kmp_free( old_data );
2340  }
2341  else {
2342  KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d allocating "
2343  "threads data for task_team %p, size = %d\n",
2344  __kmp_gtid_from_thread( thread ), task_team, nthreads ) );
2345  // Make the initial allocate for threads_data array, and zero entries
2346  // Cannot use __kmp_thread_calloc() because threads not around for
2347  // kmp_reap_task_team( ).
2348  ANNOTATE_IGNORE_WRITES_BEGIN();
2349  *threads_data_p = (kmp_thread_data_t *)
2350  __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2351  ANNOTATE_IGNORE_WRITES_END();
2352 #ifdef BUILD_TIED_TASK_STACK
2353  // GEH: Figure out if this is the right thing to do
2354  for (i = 0; i < nthreads; i++) {
2355  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2356  __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2357  }
2358 #endif // BUILD_TIED_TASK_STACK
2359  }
2360  task_team -> tt.tt_max_threads = nthreads;
2361  }
2362  else {
2363  // If array has (more than) enough elements, go ahead and use it
2364  KMP_DEBUG_ASSERT( *threads_data_p != NULL );
2365  }
2366 
2367  // initialize threads_data pointers back to thread_info structures
2368  for (i = 0; i < nthreads; i++) {
2369  kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2370  thread_data -> td.td_thr = team -> t.t_threads[i];
2371 
2372  if ( thread_data -> td.td_deque_last_stolen >= nthreads) {
2373  // The last stolen field survives across teams / barrier, and the number
2374  // of threads may have changed. It's possible (likely?) that a new
2375  // parallel region will exhibit the same behavior as the previous region.
2376  thread_data -> td.td_deque_last_stolen = -1;
2377  }
2378  }
2379 
2380  KMP_MB();
2381  TCW_SYNC_4(task_team -> tt.tt_found_tasks, TRUE);
2382  }
2383 
2384  __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2385  return is_init_thread;
2386 }
2387 
2388 
2389 //------------------------------------------------------------------------------
2390 // __kmp_free_task_threads_data:
2391 // Deallocates a threads_data array for a task team, including any attached
2392 // tasking deques. Only occurs at library shutdown.
2393 
2394 static void
2395 __kmp_free_task_threads_data( kmp_task_team_t *task_team )
2396 {
2397  __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2398  if ( task_team -> tt.tt_threads_data != NULL ) {
2399  int i;
2400  for (i = 0; i < task_team->tt.tt_max_threads; i++ ) {
2401  __kmp_free_task_deque( & task_team -> tt.tt_threads_data[i] );
2402  }
2403  __kmp_free( task_team -> tt.tt_threads_data );
2404  task_team -> tt.tt_threads_data = NULL;
2405  }
2406  __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2407 }
2408 
2409 
2410 //------------------------------------------------------------------------------
2411 // __kmp_allocate_task_team:
2412 // Allocates a task team associated with a specific team, taking it from
2413 // the global task team free list if possible. Also initializes data structures.
2414 
2415 static kmp_task_team_t *
2416 __kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
2417 {
2418  kmp_task_team_t *task_team = NULL;
2419  int nthreads;
2420 
2421  KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d entering; team = %p\n",
2422  (thread ? __kmp_gtid_from_thread( thread ) : -1), team ) );
2423 
2424  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2425  // Take a task team from the task team pool
2426  __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2427  if (__kmp_free_task_teams != NULL) {
2428  task_team = __kmp_free_task_teams;
2429  TCW_PTR(__kmp_free_task_teams, task_team -> tt.tt_next);
2430  task_team -> tt.tt_next = NULL;
2431  }
2432  __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2433  }
2434 
2435  if (task_team == NULL) {
2436  KE_TRACE( 10, ( "__kmp_allocate_task_team: T#%d allocating "
2437  "task team for team %p\n",
2438  __kmp_gtid_from_thread( thread ), team ) );
2439  // Allocate a new task team if one is not available.
2440  // Cannot use __kmp_thread_malloc() because threads not around for
2441  // kmp_reap_task_team( ).
2442  task_team = (kmp_task_team_t *) __kmp_allocate( sizeof(kmp_task_team_t) );
2443  __kmp_init_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2444  //task_team -> tt.tt_threads_data = NULL; // AC: __kmp_allocate zeroes returned memory
2445  //task_team -> tt.tt_max_threads = 0;
2446  //task_team -> tt.tt_next = NULL;
2447  }
2448 
2449  TCW_4(task_team -> tt.tt_found_tasks, FALSE);
2450 #if OMP_45_ENABLED
2451  TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
2452 #endif
2453  task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
2454 
2455  TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
2456  TCW_4( task_team -> tt.tt_active, TRUE );
2457 
2458  KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d exiting; task_team = %p unfinished_threads init'd to %d\n",
2459  (thread ? __kmp_gtid_from_thread( thread ) : -1), task_team, task_team -> tt.tt_unfinished_threads) );
2460  return task_team;
2461 }
2462 
2463 
2464 //------------------------------------------------------------------------------
2465 // __kmp_free_task_team:
2466 // Frees the task team associated with a specific thread, and adds it
2467 // to the global task team free list.
2468 
2469 void
2470 __kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team )
2471 {
2472  KA_TRACE( 20, ( "__kmp_free_task_team: T#%d task_team = %p\n",
2473  thread ? __kmp_gtid_from_thread( thread ) : -1, task_team ) );
2474 
2475  // Put task team back on free list
2476  __kmp_acquire_bootstrap_lock( & __kmp_task_team_lock );
2477 
2478  KMP_DEBUG_ASSERT( task_team -> tt.tt_next == NULL );
2479  task_team -> tt.tt_next = __kmp_free_task_teams;
2480  TCW_PTR(__kmp_free_task_teams, task_team);
2481 
2482  __kmp_release_bootstrap_lock( & __kmp_task_team_lock );
2483 }
2484 
2485 
2486 //------------------------------------------------------------------------------
2487 // __kmp_reap_task_teams:
2488 // Free all the task teams on the task team free list.
2489 // Should only be done during library shutdown.
2490 // Cannot do anything that needs a thread structure or gtid since they are already gone.
2491 
2492 void
2493 __kmp_reap_task_teams( void )
2494 {
2495  kmp_task_team_t *task_team;
2496 
2497  if ( TCR_PTR(__kmp_free_task_teams) != NULL ) {
2498  // Free all task_teams on the free list
2499  __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2500  while ( ( task_team = __kmp_free_task_teams ) != NULL ) {
2501  __kmp_free_task_teams = task_team -> tt.tt_next;
2502  task_team -> tt.tt_next = NULL;
2503 
2504  // Free threads_data if necessary
2505  if ( task_team -> tt.tt_threads_data != NULL ) {
2506  __kmp_free_task_threads_data( task_team );
2507  }
2508  __kmp_free( task_team );
2509  }
2510  __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2511  }
2512 }
2513 
2514 //------------------------------------------------------------------------------
2515 // __kmp_wait_to_unref_task_teams:
2516 // Some threads could still be in the fork barrier release code, possibly
2517 // trying to steal tasks. Wait for each thread to unreference its task team.
2518 //
2519 void
2520 __kmp_wait_to_unref_task_teams(void)
2521 {
2522  kmp_info_t *thread;
2523  kmp_uint32 spins;
2524  int done;
2525 
2526  KMP_INIT_YIELD( spins );
2527 
2528  for (;;) {
2529  done = TRUE;
2530 
2531  // TODO: GEH - this may be wrong because some sync would be necessary
2532  // in case threads are added to the pool during the traversal.
2533  // Need to verify that lock for thread pool is held when calling
2534  // this routine.
2535  for (thread = (kmp_info_t *)__kmp_thread_pool;
2536  thread != NULL;
2537  thread = thread->th.th_next_pool)
2538  {
2539 #if KMP_OS_WINDOWS
2540  DWORD exit_val;
2541 #endif
2542  if ( TCR_PTR(thread->th.th_task_team) == NULL ) {
2543  KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2544  __kmp_gtid_from_thread( thread ) ) );
2545  continue;
2546  }
2547 #if KMP_OS_WINDOWS
2548  // TODO: GEH - add this check for Linux* OS / OS X* as well?
2549  if (!__kmp_is_thread_alive(thread, &exit_val)) {
2550  thread->th.th_task_team = NULL;
2551  continue;
2552  }
2553 #endif
2554 
2555  done = FALSE; // Because th_task_team pointer is not NULL for this thread
2556 
2557  KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to unreference task_team\n",
2558  __kmp_gtid_from_thread( thread ) ) );
2559 
2560  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2561  volatile void *sleep_loc;
2562  // If the thread is sleeping, awaken it.
2563  if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) {
2564  KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2565  __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) );
2566  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2567  }
2568  }
2569  }
2570  if (done) {
2571  break;
2572  }
2573 
2574  // If we are oversubscribed,
2575  // or have waited a bit (and library mode is throughput), yield.
2576  // Pause is in the following code.
2577  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2578  KMP_YIELD_SPIN( spins ); // Yields only if KMP_LIBRARY=throughput
2579  }
2580 }
2581 
2582 
2583 //------------------------------------------------------------------------------
2584 // __kmp_task_team_setup: Create a task_team for the current team, but use
2585 // an already created, unused one if it already exists.
2586 void
2587 __kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int always )
2588 {
2589  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2590 
2591  // If this task_team hasn't been created yet, allocate it. It will be used in the region after the next.
2592  // If it exists, it is the current task team and shouldn't be touched yet as it may still be in use.
2593  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && (always || team->t.t_nproc > 1) ) {
2594  team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team( this_thr, team );
2595  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p for team %d at parity=%d\n",
2596  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state],
2597  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2598  }
2599 
2600  // After threads exit the release, they will call sync, and then point to this other task_team; make sure it is
2601  // allocated and properly initialized. As threads spin in the barrier release phase, they will continue to use the
2602  // previous task_team struct(above), until they receive the signal to stop checking for tasks (they can't safely
2603  // reference the kmp_team_t struct, which could be reallocated by the master thread). No task teams are formed for
2604  // serialized teams.
2605  if (team->t.t_nproc > 1) {
2606  int other_team = 1 - this_thr->th.th_task_state;
2607  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2608  team->t.t_task_team[other_team] = __kmp_allocate_task_team( this_thr, team );
2609  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new task_team %p for team %d at parity=%d\n",
2610  __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2611  ((team != NULL) ? team->t.t_id : -1), other_team ));
2612  }
2613  else { // Leave the old task team struct in place for the upcoming region; adjust as needed
2614  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2615  if (!task_team->tt.tt_active || team->t.t_nproc != task_team->tt.tt_nproc) {
2616  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2617  TCW_4(task_team->tt.tt_found_tasks, FALSE);
2618 #if OMP_45_ENABLED
2619  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2620 #endif
2621  TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc );
2622  TCW_4(task_team->tt.tt_active, TRUE );
2623  }
2624  // if team size has changed, the first thread to enable tasking will realloc threads_data if necessary
2625  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team %p for team %d at parity=%d\n",
2626  __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2627  ((team != NULL) ? team->t.t_id : -1), other_team ));
2628  }
2629  }
2630 }
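
// Illustrative parity example (not part of the runtime itself): team->t.t_task_team[]
// holds two task teams used on alternating barrier generations. If th_task_state is 0 for
// the current region, the code above prepares t_task_team[1] for the next one, and
// __kmp_task_team_sync() below flips each thread's parity,
//
//   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;   // 0 -> 1 -> 0 -> ...
//
// so that threads pick up the freshly initialized task team after the barrier release.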
2631 
2632 
2633 //------------------------------------------------------------------------------
2634 // __kmp_task_team_sync: Propagation of task team data from team to threads
2635 // which happens just after the release phase of a team barrier. This may be
2636 // called by any thread, but only for teams with # threads > 1.
2637 
2638 void
2639 __kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
2640 {
2641  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2642 
2643  // Toggle the th_task_state field, to switch which task_team this thread refers to
2644  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2645  // It is now safe to propagate the task team pointer from the team struct to the current thread.
2646  TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[this_thr->th.th_task_state]);
2647  KA_TRACE(20, ("__kmp_task_team_sync: Thread T#%d task team switched to task_team %p from Team #%d (parity=%d)\n",
2648  __kmp_gtid_from_thread( this_thr ), this_thr->th.th_task_team,
2649  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2650 }
2651 
2652 
2653 //--------------------------------------------------------------------------------------------
2654 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the barrier gather
2655 // phase. Only called by master thread if #threads in team > 1 or if proxy tasks were created.
2656 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off by passing in 0
2657 // optionally as the last argument. When wait is zero, master thread does not wait for
2658 // unfinished_threads to reach 0.
2659 void
2660 __kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
2661  USE_ITT_BUILD_ARG(void * itt_sync_obj)
2662  , int wait)
2663 {
2664  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2665 
2666  KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2667  KMP_DEBUG_ASSERT( task_team == this_thr->th.th_task_team );
2668 
2669  if ( ( task_team != NULL ) && KMP_TASKING_ENABLED(task_team) ) {
2670  if (wait) {
2671  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks (for unfinished_threads to reach 0) on task_team = %p\n",
2672  __kmp_gtid_from_thread(this_thr), task_team));
2673  // Worker threads may have dropped through to release phase, but could still be executing tasks. Wait
2674  // here for tasks to complete. To avoid memory contention, only master thread checks termination condition.
2675  kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
2676  flag.wait(this_thr, TRUE
2677  USE_ITT_BUILD_ARG(itt_sync_obj));
2678  }
2679  // Deactivate the old task team, so that the worker threads will stop referencing it while spinning.
2680  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2681  "setting active to false, setting local and team's pointer to NULL\n",
2682  __kmp_gtid_from_thread(this_thr), task_team));
2683 #if OMP_45_ENABLED
2684  KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
2685  TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
2686 #else
2687  KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
2688 #endif
2689  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2690  KMP_MB();
2691 
2692  TCW_PTR(this_thr->th.th_task_team, NULL);
2693  }
2694 }
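
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the wait performed above, using C++11 atomics instead of kmp_flag_32:
// each worker decrements an "unfinished" counter when it runs out of tasks,
// and only the master polls the counter until it reaches zero.
#include <atomic>
#include <thread>
#include <vector>

static std::atomic<unsigned> toy_unfinished_threads( 0 );

static void toy_worker()
{
    // ... steal and execute remaining tasks here ...
    toy_unfinished_threads.fetch_sub( 1, std::memory_order_release );
}

static void toy_task_team_wait( int nthreads )
{
    toy_unfinished_threads.store( (unsigned)nthreads, std::memory_order_relaxed );
    std::vector<std::thread> pool;
    for ( int i = 0; i < nthreads; ++i )
        pool.emplace_back( toy_worker );
    while ( toy_unfinished_threads.load( std::memory_order_acquire ) != 0 )
        std::this_thread::yield();          // only the master checks the condition
    for ( size_t i = 0; i < pool.size(); ++i )
        pool[i].join();
}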
2695 
2696 
2697 //------------------------------------------------------------------------------
2698 // __kmp_tasking_barrier:
2699 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
2700 // Internal function to execute all tasks prior to a regular barrier or a
2701 // join barrier. It is a full barrier itself, which unfortunately turns
2702 // regular barriers into double barriers and join barriers into 1 1/2
2703 // barriers.
2704 void
2705 __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
2706 {
2707  volatile kmp_uint32 *spin = &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
2708  int flag = FALSE;
2709  KMP_DEBUG_ASSERT( __kmp_tasking_mode == tskm_extra_barrier );
2710 
2711 #if USE_ITT_BUILD
2712  KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
2713 #endif /* USE_ITT_BUILD */
2714  kmp_flag_32 spin_flag(spin, 0U);
2715  while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag
2716  USE_ITT_BUILD_ARG(NULL), 0 ) ) {
2717 #if USE_ITT_BUILD
2718  // TODO: What about itt_sync_obj??
2719  KMP_FSYNC_SPIN_PREPARE( spin );
2720 #endif /* USE_ITT_BUILD */
2721 
2722  if( TCR_4(__kmp_global.g.g_done) ) {
2723  if( __kmp_global.g.g_abort )
2724  __kmp_abort_thread( );
2725  break;
2726  }
2727  KMP_YIELD( TRUE ); // GH: We always yield here
2728  }
2729 #if USE_ITT_BUILD
2730  KMP_FSYNC_SPIN_ACQUIRED( (void*) spin );
2731 #endif /* USE_ITT_BUILD */
2732 }
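
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the spin loop above: keep trying to execute tasks, but re-check a global
// shutdown flag after every failed attempt and always yield so the spinning
// thread does not monopolize its core.
#include <atomic>
#include <thread>

static std::atomic<unsigned> toy_spin_count( 8 );       // stand-in for tt_unfinished_threads
static std::atomic<bool>     toy_global_done( false );  // stand-in for __kmp_global.g.g_done

static bool toy_execute_tasks()
{
    // Pretend one more thread finished its tasks; report whether everyone is done.
    unsigned v = toy_spin_count.load( std::memory_order_acquire );
    if ( v != 0 )
        toy_spin_count.compare_exchange_strong( v, v - 1 );
    return toy_spin_count.load( std::memory_order_acquire ) == 0;
}

static void toy_tasking_barrier()
{
    while ( !toy_execute_tasks() ) {
        if ( toy_global_done.load( std::memory_order_acquire ) )
            break;                                      // runtime is shutting down
        std::this_thread::yield();                      // we always yield here
    }
}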
2733 
2734 
2735 #if OMP_45_ENABLED
2736 
2737 /* __kmp_give_task puts a task into a given thread queue if:
2738  - the queue for that thread was created
2739  - there's space in that queue
2740 
2741  Because of this, __kmp_push_task needs to check if there's space after getting the lock
2742  */
2743 static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
2744 {
2745  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
2746  kmp_task_team_t * task_team = taskdata->td_task_team;
2747 
2748  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
2749 
2750  // If task_team is NULL, something has gone really wrong...
2751  KMP_DEBUG_ASSERT( task_team != NULL );
2752 
2753  bool result = false;
2754  kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
2755 
2756  if (thread_data -> td.td_deque == NULL ) {
2757  // There's no queue in this thread, go find another one
2758  // We're guaranteed that at least one thread has a queue
2759  KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
2760  return result;
2761  }
2762 
2763  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2764  {
2765  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2766 
2767  // if this deque is bigger than the pass ratio give a chance to another thread
2768  if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
2769 
2770  __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2771  __kmp_realloc_task_deque(thread,thread_data);
2772 
2773  } else {
2774 
2775  __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2776 
2777  if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2778  {
2779  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2780 
2781  // if this deque is bigger than the pass ratio give a chance to another thread
2782  if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
2783  goto release_and_exit;
2784 
2785  __kmp_realloc_task_deque(thread,thread_data);
2786  }
2787  }
2788 
2789  // lock is held here, and there is space in the deque
2790 
2791  thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
2792  // Wrap index.
2793  thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
2794  TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
2795 
2796  result = true;
2797  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", taskdata, tid ) );
2798 
2799 release_and_exit:
2800  __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
2801 
2802  return result;
2803 }
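
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the push performed above: if the ring buffer is full, either give up so
// the caller can try another thread (when the deque has already grown past
// the current pass ratio) or double the deque, then store at the tail and
// advance it with a power-of-two wrap mask.
#include <cstdlib>

struct toy_deque {
    void   **buf;
    unsigned size;          // always a power of two
    unsigned ntasks;
    unsigned head, tail;
};

static void toy_grow( toy_deque *d )
{
    void **nb = (void **)std::malloc( 2 * d->size * sizeof(void *) );
    for ( unsigned i = 0; i < d->ntasks; ++i )          // unwrap the old ring
        nb[i] = d->buf[(d->head + i) & (d->size - 1)];
    std::free( d->buf );
    d->buf  = nb;
    d->size *= 2;
    d->head = 0;
    d->tail = d->ntasks;
}

static bool toy_give_task( toy_deque *d, void *task, unsigned initial_size, unsigned pass )
{
    if ( d->ntasks >= d->size ) {
        if ( d->size / initial_size >= pass )
            return false;                               // let the caller try another thread
        toy_grow( d );
    }
    d->buf[d->tail] = task;
    d->tail = (d->tail + 1) & (d->size - 1);            // wrap index
    d->ntasks++;
    return true;
}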
2804 
2805 
2806 /* The finish of a proxy task is divided into two pieces:
2807  - the top half is the part that can be done from a thread outside the team
2808  - the bottom half must be run from a thread within the team
2809 
2810  In order to run the bottom half, the task gets queued back into one of the threads of the team.
2811  Once the td_incomplete_child_tasks counter of the parent is decremented, the threads can leave the barriers.
2812  So, the bottom half needs to be queued before the counter is decremented. The top half is therefore divided into two parts:
2813  - things that can be run before queuing the bottom half
2814  - things that must be run after queuing the bottom half
2815 
2816  This creates a second race, as the bottom half can free the task before the second top half is executed. To avoid this,
2817  we use the td_incomplete_child_tasks counter of the proxy task itself to synchronize the top and bottom halves.
2818 */
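
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the synchronization described above, using C++11 atomics: the first top
// half adds one "imaginary" incomplete child to the proxy, so the bottom half
// (which may run on another thread as soon as it is queued) must spin until
// the second top half has finished its bookkeeping and dropped that child.
#include <atomic>

struct toy_proxy {
    std::atomic<int>  incomplete_children;   // the proxy's own counter
    std::atomic<int> *parent_children;       // the parent's counter
    bool              complete;
};

static void toy_first_top_half( toy_proxy *t )
{
    t->complete = true;
    t->incomplete_children.fetch_add( 1 );   // imaginary child: block early freeing
}

static void toy_second_top_half( toy_proxy *t )
{
    t->parent_children->fetch_sub( 1 );      // parent may now leave its barrier
    t->incomplete_children.fetch_sub( 1 );   // drop the imaginary child
}

static void toy_bottom_half( toy_proxy *t )
{
    while ( t->incomplete_children.load() > 0 )
        ;                                    // wait for the second top half (brief)
    // ... release dependences and free the task here ...
}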
2819 
2820 static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2821 {
2822  KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
2823  KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2824  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
2825  KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
2826 
2827  taskdata -> td_flags.complete = 1; // mark the task as completed
2828 
2829  if ( taskdata->td_taskgroup )
2830  KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
2831 
2832  // Create an imaginary child for this task so the bottom half cannot release the task before we have completed the second top half
2833  TCI_4(taskdata->td_incomplete_child_tasks);
2834 }
2835 
2836 static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2837 {
2838  kmp_int32 children = 0;
2839 
2840  // KMP_TEST_THEN_DEC32 returns the old value; the "- 1" simulates a pre-decrement
2841  children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
2842  KMP_DEBUG_ASSERT( children >= 0 );
2843 
2844  // Remove the imaginary child
2845  TCD_4(taskdata->td_incomplete_child_tasks);
2846 }
2847 
2848 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
2849 {
2850  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2851  kmp_info_t * thread = __kmp_threads[ gtid ];
2852 
2853  KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2854  KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
2855 
2856  // We need to wait to make sure the top half is finished
2857  // Spinning here should be ok as this should happen quickly
2858  while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
2859 
2860  __kmp_release_deps(gtid,taskdata);
2861  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
2862 }
2863 
2871 void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
2872 {
2873  KMP_DEBUG_ASSERT( ptask != NULL );
2874  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2875  KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
2876 
2877  KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2878 
2879  __kmp_first_top_half_finish_proxy(taskdata);
2880  __kmp_second_top_half_finish_proxy(taskdata);
2881  __kmp_bottom_half_finish_proxy(gtid,ptask);
2882 
2883  KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
2884 }
2885 
2892 void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
2893 {
2894  KMP_DEBUG_ASSERT( ptask != NULL );
2895  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2896 
2897  KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
2898 
2899  KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2900 
2901  __kmp_first_top_half_finish_proxy(taskdata);
2902 
2903  // Enqueue the task so that the bottom half of its completion runs on a thread within the corresponding team
2904  kmp_team_t * team = taskdata->td_team;
2905  kmp_int32 nthreads = team->t.t_nproc;
2906  kmp_info_t *thread;
2907 
2908  // Ideally this would be start_k = __kmp_get_random( thread ) % nthreads, but __kmp_get_random cannot be used here
2909  kmp_int32 start_k = 0;
2910  kmp_int32 pass = 1;
2911  kmp_int32 k = start_k;
2912 
2913  do {
2914  //For now we're just linearly trying to find a thread
2915  thread = team->t.t_threads[k];
2916  k = (k+1) % nthreads;
2917 
2918  // we did a full pass through all the threads
2919  if ( k == start_k ) pass = pass << 1;
2920 
2921  } while ( !__kmp_give_task( thread, k, ptask, pass ) );
2922 
2923  __kmp_second_top_half_finish_proxy(taskdata);
2924 
2925  KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
2926 }
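
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the distribution loop above: walk the team's threads round robin, and
// each time a full lap fails to place the task, double 'pass' so that larger
// (or newly grown) deques are accepted on the next lap.
#include <cstdio>

static const unsigned toy_initial_size = 256;
// Per-thread deque capacities, as multiples of the initial size.
static unsigned toy_capacity[4] = { 256, 512, 1024, 256 };

static bool toy_try_place( int tid, int pass )
{
    // Mimic the pass-ratio test: accept only if capacity/initial < pass.
    return toy_capacity[tid] / toy_initial_size < (unsigned)pass;
}

static int toy_distribute( int nthreads )
{
    int start_k = 0, pass = 1, k = start_k, target;
    do {
        target = k;                         // candidate thread for this attempt
        k = (k + 1) % nthreads;
        if ( k == start_k )
            pass = pass << 1;               // full lap without success: relax the limit
    } while ( !toy_try_place( target, pass ) );
    return target;
}

int main()
{
    std::printf( "task placed on thread %d\n", toy_distribute( 4 ) );
    return 0;
}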
2927 
2928 //---------------------------------------------------------------------------------
2929 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
2930 //
2931 // thread: allocating thread
2932 // task_src: pointer to source task to be duplicated
2933 // returns: a pointer to the allocated kmp_task_t structure (task).
2934 kmp_task_t *
2935 __kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
2936 {
2937  kmp_task_t *task;
2938  kmp_taskdata_t *taskdata;
2939  kmp_taskdata_t *taskdata_src;
2940  kmp_taskdata_t *parent_task = thread->th.th_current_task;
2941  size_t shareds_offset;
2942  size_t task_size;
2943 
2944  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
2945  taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
2946  KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task
2947  KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
2948  task_size = taskdata_src->td_size_alloc;
2949 
2950  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
2951  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
2952  #if USE_FAST_MEMORY
2953  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
2954  #else
2955  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
2956  #endif /* USE_FAST_MEMORY */
2957  KMP_MEMCPY(taskdata, taskdata_src, task_size);
2958 
2959  task = KMP_TASKDATA_TO_TASK(taskdata);
2960 
2961  // Initialize the new task (only the fields not already handled by the memcpy above)
2962  taskdata->td_task_id = KMP_GEN_TASK_ID();
2963  if( task->shareds != NULL ) { // need to set up the shareds pointer
2964  shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
2965  task->shareds = &((char*)taskdata)[shareds_offset];
2966  KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
2967  }
2968  taskdata->td_alloc_thread = thread;
2969  taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
2970 
2971  // Only need to keep track of child task counts if team parallel and tasking not serialized
2972  if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
2973  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
2974  if ( parent_task->td_taskgroup )
2975  KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
2976  // Only need to keep track of allocated child tasks for explicit tasks, since implicit tasks are not deallocated
2977  if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
2978  KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
2979  }
2980 
2981  KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
2982  thread, taskdata, taskdata->td_parent) );
2983 #if OMPT_SUPPORT
2984  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
2985 #endif
2986  return task;
2987 }
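
// A minimal sketch (not part of kmp_tasking.cpp; the toy_* names are made up)
// of the shareds fix-up above: when a block that contains a pointer into
// itself is copied byte-for-byte, the copy's pointer must be rebased by
// applying the same offset to the new base address.
#include <cstring>
#include <cstdlib>
#include <cassert>

struct toy_block {
    char  header[64];
    void *interior;          // points into 'payload' of the same block
    char  payload[64];
};

static toy_block * toy_dup( const toy_block *src )
{
    toy_block *dst = (toy_block *)std::malloc( sizeof(toy_block) );
    std::memcpy( dst, src, sizeof(toy_block) );
    if ( src->interior != NULL ) {
        size_t offset = (char *)src->interior - (char *)src;   // offset inside src
        dst->interior = (char *)dst + offset;                  // same offset in dst
    }
    return dst;
}

int main()
{
    toy_block *src = (toy_block *)std::malloc( sizeof(toy_block) );
    src->interior = src->payload;
    toy_block *dst = toy_dup( src );
    assert( dst->interior == dst->payload );   // rebased, no longer aliases src
    std::free( src );
    std::free( dst );
    return 0;
}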
2988 
2989 // Routine optionally generated by the compiler for setting the lastprivate flag
2990 // and calling needed constructors for private/firstprivate objects
2991 // (used to form taskloop tasks from pattern task)
2992 typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2993 
2994 //---------------------------------------------------------------------------------
2995 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
2996 //
2997 // loc Source location information
2998 // gtid Global thread ID
2999 // task Task with whole loop iteration range
3000 // lb Pointer to loop lower bound
3001 // ub Pointer to loop upper bound
3002 // st Loop stride
3003 // sched Schedule specified 0/1/2 for none/grainsize/num_tasks
3004 // grainsize Schedule value if specified
3005 // task_dup Tasks duplication routine
3006 void
3007 __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3008  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3009  int sched, kmp_uint64 grainsize, void *task_dup )
3010 {
3011  KMP_COUNT_BLOCK(OMP_TASKLOOP);
3012  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3013  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3014  kmp_uint64 tc;
3015  kmp_uint64 lower = *lb; // compiler provides global bounds here
3016  kmp_uint64 upper = *ub;
3017  kmp_uint64 i, num_tasks = 0, extras = 0;
3018  kmp_info_t *thread = __kmp_threads[gtid];
3019  kmp_taskdata_t *current_task = thread->th.th_current_task;
3020  kmp_task_t *next_task;
3021  kmp_int32 lastpriv = 0;
3022  size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
3023  size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
3024 
3025  // compute trip count
3026  if ( st == 1 ) { // most common case
3027  tc = upper - lower + 1;
3028  } else if ( st < 0 ) {
3029  tc = (lower - upper) / (-st) + 1;
3030  } else { // st > 0
3031  tc = (upper - lower) / st + 1;
3032  }
3033  if(tc == 0) {
3034  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3035  // free the pattern task and exit
3036  __kmp_task_start( gtid, task, current_task );
3037  // do not execute anything for zero-trip loop
3038  __kmp_task_finish( gtid, task, current_task );
3039  return;
3040  }
3041 
3042  // compute num_tasks/grainsize based on the input provided
3043  switch( sched ) {
3044  case 0: // no schedule clause specified, we can choose the default
3045  // let's try to schedule (team_size*10) tasks
3046  grainsize = thread->th.th_team_nproc * 10; // no break: fall through and treat this value as num_tasks
3047  case 2: // num_tasks provided
3048  if( grainsize > tc ) {
3049  num_tasks = tc; // too big num_tasks requested, adjust values
3050  grainsize = 1;
3051  extras = 0;
3052  } else {
3053  num_tasks = grainsize;
3054  grainsize = tc / num_tasks;
3055  extras = tc % num_tasks;
3056  }
3057  break;
3058  case 1: // grainsize provided
3059  if( grainsize > tc ) {
3060  num_tasks = 1; // too big grainsize requested, adjust values
3061  grainsize = tc;
3062  extras = 0;
3063  } else {
3064  num_tasks = tc / grainsize;
3065  grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
3066  extras = tc % num_tasks;
3067  }
3068  break;
3069  default:
3070  KMP_ASSERT2(0, "unknown scheduling of taskloop");
3071  }
3072  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3073  KMP_DEBUG_ASSERT(num_tasks > extras);
3074  KMP_DEBUG_ASSERT(num_tasks > 0);
3075  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize %lld, extras %lld\n",
3076  gtid, num_tasks, grainsize, extras));
3077 
3078  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
3079  for( i = 0; i < num_tasks; ++i ) {
3080  kmp_uint64 chunk_minus_1;
3081  if( extras == 0 ) {
3082  chunk_minus_1 = grainsize - 1;
3083  } else {
3084  chunk_minus_1 = grainsize;
3085  --extras; // first extras iterations get bigger chunk (grainsize+1)
3086  }
3087  upper = lower + st * chunk_minus_1;
3088  if( i == num_tasks - 1 ) {
3089  // schedule the last task, set lastprivate flag
3090  lastpriv = 1;
3091 #if KMP_DEBUG
3092  if( st == 1 )
3093  KMP_DEBUG_ASSERT(upper == *ub);
3094  else if( st > 0 )
3095  KMP_DEBUG_ASSERT(upper+st > *ub);
3096  else
3097  KMP_DEBUG_ASSERT(upper+st < *ub);
3098 #endif
3099  }
3100  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3101  *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
3102  *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
3103  if( ptask_dup != NULL )
3104  ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
3105  KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper %lld (offsets %p %p)\n",
3106  gtid, next_task, lower, upper, lower_offset, upper_offset));
3107  __kmp_omp_task(gtid, next_task, true); // schedule new task
3108  lower = upper + st; // adjust lower bound for the next iteration
3109  }
3110  // free the pattern task and exit
3111  __kmp_task_start( gtid, task, current_task );
3112  // do not execute the pattern task, just do bookkeeping
3113  __kmp_task_finish( gtid, task, current_task );
3114 }
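
// A minimal sketch (not part of kmp_tasking.cpp; the sample numbers are made
// up) of the arithmetic used above: compute the trip count from lb/ub/st,
// then split it so that tc == num_tasks * grainsize + extras, with the first
// 'extras' chunks carrying one extra iteration each.
#include <cassert>
#include <cstdio>

int main()
{
    unsigned long long lb = 0, ub = 999;
    long long          st = 1;
    unsigned long long tc;
    if ( st == 1 )
        tc = ub - lb + 1;
    else if ( st < 0 )
        tc = (lb - ub) / (unsigned long long)(-st) + 1;
    else
        tc = (ub - lb) / (unsigned long long)st + 1;

    unsigned long long grainsize = 300;     // as if grainsize(300) was specified
    unsigned long long num_tasks, extras;
    if ( grainsize > tc ) {
        num_tasks = 1; grainsize = tc; extras = 0;
    } else {
        num_tasks = tc / grainsize;         // 3 tasks
        grainsize = tc / num_tasks;         // rebalanced to 333 iterations each
        extras    = tc % num_tasks;         // 1 task gets 334 iterations
    }
    assert( tc == num_tasks * grainsize + extras );
    std::printf( "tc=%llu num_tasks=%llu grainsize=%llu extras=%llu\n",
                 tc, num_tasks, grainsize, extras );
    return 0;
}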
3115 
3132 void
3133 __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3134  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3135  int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
3136 {
3137  kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
3138  KMP_DEBUG_ASSERT( task != NULL );
3139 
3140  KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
3141  gtid, taskdata, *lb, *ub, st, grainsize, sched));
3142 
3143  // check the if-clause value first
3144  if( if_val == 0 ) { // if(0) specified, mark task as serial
3145  taskdata->td_flags.task_serial = 1;
3146  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3147  }
3148  if( nogroup == 0 ) {
3149  __kmpc_taskgroup( loc, gtid );
3150  }
3151 
3152  if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
3153  __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
3154  }
3155 
3156  if( nogroup == 0 ) {
3157  __kmpc_end_taskgroup( loc, gtid );
3158  }
3159  KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3160 }
3161 
3162 #endif