LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_affinity.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
45 #endif /* defined(KMP_GOMP_COMPAT) */
46 
47 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
48 #if OMP_45_ENABLED
49  "4.5 (201511)";
50 #elif OMP_40_ENABLED
51  "4.0 (201307)";
52 #else
53  "3.1 (201107)";
54 #endif
55 
56 #ifdef KMP_DEBUG
57 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
58 #endif /* KMP_DEBUG */
59 
60 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
61 
62 /* ------------------------------------------------------------------------ */
63 /* ------------------------------------------------------------------------ */
64 
65 kmp_info_t __kmp_monitor;
66 
67 /* ------------------------------------------------------------------------ */
68 /* ------------------------------------------------------------------------ */
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup( void );
73 
74 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
75 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
76 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
77 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
78 #endif
79 static void __kmp_do_serial_initialize( void );
80 void __kmp_fork_barrier( int gtid, int tid );
81 void __kmp_join_barrier( int gtid );
82 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
86 #endif
87 
88 static int __kmp_expand_threads(int nWish, int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread( int gtid );
91 #endif
92 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
93 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
94 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* ------------------------------------------------------------------------ */
97 /* ------------------------------------------------------------------------ */
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique */
101 /* identifier of executing thread. */
102 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
103 
104 int
105 __kmp_get_global_thread_id( )
106 {
107  int i;
108  kmp_info_t **other_threads;
109  size_t stack_data;
110  char *stack_addr;
111  size_t stack_size;
112  char *stack_base;
113 
114  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
115  __kmp_nth, __kmp_all_nth ));
116 
117  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
118  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
119  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
120  __kmp_init_gtid for this to work. */
121 
122  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
123 
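 /* __kmp_gtid_mode selects how the gtid is obtained: mode >= 3 reads the
 thread-local (TDATA) __kmp_gtid variable, mode >= 2 uses keyed TLS via
 __kmp_gtid_get_specific(), and lower modes fall back to the stack-address
 search below. */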
124 #ifdef KMP_TDATA_GTID
125  if ( TCR_4(__kmp_gtid_mode) >= 3) {
126  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
127  return __kmp_gtid;
128  }
129 #endif
130  if ( TCR_4(__kmp_gtid_mode) >= 2) {
131  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
132  return __kmp_gtid_get_specific();
133  }
134  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
135 
136  stack_addr = (char*) & stack_data;
137  other_threads = __kmp_threads;
138 
139  /*
140  ATT: The code below is a source of potential bugs due to unsynchronized access to
141  __kmp_threads array. For example:
142  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
143  2. Current thread is suspended by OS.
144  3. Another thread unregisters and finishes (debug versions of free() may fill memory
145  with something like 0xEF).
146  4. Current thread is resumed.
147  5. Current thread reads junk from *thr.
148  TODO: Fix it.
149  --ln
150  */
151 
152  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
153 
154  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
155  if( !thr ) continue;
156 
157  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
158  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
159 
160  /* stack grows down -- search through all of the active threads */
161 
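 /* A thread's stack occupies [stack_base - stack_size, stack_base]; if the
 address of our local stack_data falls inside thread i's interval, we must be
 running on thread i's stack, so i is our gtid. */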
162  if( stack_addr <= stack_base ) {
163  size_t stack_diff = stack_base - stack_addr;
164 
165  if( stack_diff <= stack_size ) {
166  /* The only way we can be closer than the allocated */
167  /* stack size is if we are running on this thread. */
168  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
169  return i;
170  }
171  }
172  }
173 
174  /* get specific to try and determine our gtid */
175  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
176  "thread, using TLS\n" ));
177  i = __kmp_gtid_get_specific();
178 
179  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
180 
181  /* if we haven't been assigned a gtid, then return the error code */
182  if( i<0 ) return i;
183 
184  /* dynamically updated stack window for uber threads to avoid get_specific call */
185  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
186  KMP_FATAL( StackOverflow, i );
187  }
188 
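 /* Refine the recorded stack window for this uber thread so that it covers the
 current stack address: either raise ds_stackbase (when the address is above
 the recorded base) or extend ds_stacksize, so the search loop above can match
 on the next lookup without another get_specific call. */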
189  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
190  if( stack_addr > stack_base ) {
191  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
192  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
193  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
194  } else {
195  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
196  }
197 
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if ( __kmp_storage_map ) {
200  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
201  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
203  other_threads[i]->th.th_info.ds.ds_stacksize,
204  "th_%d stack (refinement)", i );
205  }
206  return i;
207 }
208 
209 int
210 __kmp_get_global_thread_id_reg( )
211 {
212  int gtid;
213 
214  if ( !__kmp_init_serial ) {
215  gtid = KMP_GTID_DNE;
216  } else
217 #ifdef KMP_TDATA_GTID
218  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
219  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
220  gtid = __kmp_gtid;
221  } else
222 #endif
223  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
224  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
225  gtid = __kmp_gtid_get_specific();
226  } else {
227  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
228  gtid = __kmp_get_global_thread_id();
229  }
230 
231  /* we must be a new uber master sibling thread */
232  if( gtid == KMP_GTID_DNE ) {
233  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
234  "Registering a new gtid.\n" ));
235  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
236  if( !__kmp_init_serial ) {
237  __kmp_do_serial_initialize();
238  gtid = __kmp_gtid_get_specific();
239  } else {
240  gtid = __kmp_register_root(FALSE);
241  }
242  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
243  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245 
246  KMP_DEBUG_ASSERT( gtid >=0 );
247 
248  return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void
253 __kmp_check_stack_overlap( kmp_info_t *th )
254 {
255  int f;
256  char *stack_beg = NULL;
257  char *stack_end = NULL;
258  int gtid;
259 
260  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
261  if ( __kmp_storage_map ) {
262  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
263  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
264 
265  gtid = __kmp_gtid_from_thread( th );
266 
267  if (gtid == KMP_GTID_MONITOR) {
268  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
269  "th_%s stack (%s)", "mon",
270  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
271  } else {
272  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273  "th_%d stack (%s)", gtid,
274  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
275  }
276  }
277 
278  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
279  gtid = __kmp_gtid_from_thread( th );
280  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
281  {
282  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
283  if ( stack_beg == NULL ) {
284  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
285  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
286  }
287 
288  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
289  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
290 
291  if( f_th && f_th != th ) {
292  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
293  char *other_stack_beg = other_stack_end -
294  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
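 /* The stacks overlap if either endpoint of this thread's stack lies strictly
 between other_stack_beg and other_stack_end. */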
295  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
296  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
297 
298  /* Print the other stack values before the abort */
299  if ( __kmp_storage_map )
300  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
301  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
302  "th_%d stack (overlapped)",
303  __kmp_gtid_from_thread( f_th ) );
304 
305  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
306  }
307  }
308  }
309  }
310  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
311 }
312 
313 
314 /* ------------------------------------------------------------------------ */
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void
319 __kmp_infinite_loop( void )
320 {
321  static int done = FALSE;
322 
323  while (! done) {
324  KMP_YIELD( 1 );
325  }
326 }
327 
328 #define MAX_MESSAGE 512
329 
330 void
331 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
332  char buffer[MAX_MESSAGE];
333  va_list ap;
334 
335  va_start( ap, format);
336  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
337  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
338  __kmp_vprintf( kmp_err, buffer, ap );
339 #if KMP_PRINT_DATA_PLACEMENT
340  int node;
341  if(gtid >= 0) {
342  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
343  if( __kmp_storage_map_verbose ) {
344  node = __kmp_get_host_node(p1);
345  if(node < 0) /* doesn't work, so don't try this next time */
346  __kmp_storage_map_verbose = FALSE;
347  else {
348  char *last;
349  int lastNode;
350  int localProc = __kmp_get_cpu_from_gtid(gtid);
351 
352  const int page_size = KMP_GET_PAGE_SIZE();
353 
354  p1 = (void *)( (size_t)p1 & ~((size_t)page_size - 1) );
355  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)page_size - 1) );
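 /* p1 and p2 are now rounded down to page boundaries; e.g. with a 4096-byte
 page the mask is ~0xFFF, so 0x7f1234 becomes 0x7f1000. p2 is aligned from its
 last byte (p2 - 1) so that a range ending exactly on a page boundary does not
 spill into the following page. */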
356  if(localProc >= 0)
357  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
358  else
359  __kmp_printf_no_lock(" GTID %d\n", gtid);
360 # if KMP_USE_PRCTL
361 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
362  do {
363  last = p1;
364  lastNode = node;
365  /* This loop collates adjacent pages with the same host node. */
366  do {
367  p1 = (char *)p1 + page_size;
368  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
369  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
370  (char*)p1 - 1, lastNode);
371  } while(p1 <= p2);
372 # else
373  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
374  (char*)p1 + (page_size - 1), __kmp_get_host_node(p1));
375  if(p1 < p2) {
376  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
377  (char*)p2 + (page_size - 1), __kmp_get_host_node(p2));
378  }
379 # endif
380  }
381  }
382  } else
383  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
384  }
385 #endif /* KMP_PRINT_DATA_PLACEMENT */
386  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
387 }
388 
389 void
390 __kmp_warn( char const * format, ... )
391 {
392  char buffer[MAX_MESSAGE];
393  va_list ap;
394 
395  if ( __kmp_generate_warnings == kmp_warnings_off ) {
396  return;
397  }
398 
399  va_start( ap, format );
400 
401  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
402  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
403  __kmp_vprintf( kmp_err, buffer, ap );
404  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
405 
406  va_end( ap );
407 }
408 
409 void
410 __kmp_abort_process()
411 {
412 
413  // Later threads may stall here, but that's ok because abort() will kill them.
414  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
415 
416  if ( __kmp_debug_buf ) {
417  __kmp_dump_debug_buffer();
418  }; // if
419 
420  if ( KMP_OS_WINDOWS ) {
421  // Let other threads know of abnormal termination and prevent deadlock
422  // if abort happened during library initialization or shutdown
423  __kmp_global.g.g_abort = SIGABRT;
424 
425  /*
426  On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
427  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
428  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
429  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
430  not help, at least in some versions of the MS C RTL.
431 
432  It seems the following sequence is the only way to simulate abort() and avoid the pop-up
433  error box.
434  */
435  raise( SIGABRT );
436  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
437  } else {
438  abort();
439  }; // if
440 
441  __kmp_infinite_loop();
442  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
443 
444 } // __kmp_abort_process
445 
446 void
447 __kmp_abort_thread( void )
448 {
449  // TODO: Eliminate g_abort global variable and this function.
450  // In case of abort just call abort(), it will kill all the threads.
451  __kmp_infinite_loop();
452 } // __kmp_abort_thread
453 
454 /* ------------------------------------------------------------------------ */
455 
456 /*
457  * Print out the storage map for the major kmp_info_t thread data structures
458  * that are allocated together.
459  */
460 
461 static void
462 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
463 {
464  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
465 
466  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
467  "th_%d.th_info", gtid );
468 
469  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
470  "th_%d.th_local", gtid );
471 
472  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
474 
475  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier+1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
478 
479  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480  &thr->th.th_bar[bs_forkjoin_barrier+1],
481  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
482 
483  #if KMP_FAST_REDUCTION_BARRIER
484  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
485  &thr->th.th_bar[bs_reduction_barrier+1],
486  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
487  #endif // KMP_FAST_REDUCTION_BARRIER
488 }
489 
490 /*
491  * Print out the storage map for the major kmp_team_t team data structures
492  * that are allocated together.
493  */
494 
495 static void
496 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
497 {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id );
501 
502  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
503  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
504 
505 
506  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
507  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
508 
509  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
510  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
511 
512  #if KMP_FAST_REDUCTION_BARRIER
513  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
514  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
515  #endif // KMP_FAST_REDUCTION_BARRIER
516 
517  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
518  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
519 
520  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
522 
523  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
524  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
525  header, team_id );
526 
527 
528  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
529  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
530 }
531 
532 static void __kmp_init_allocator() {}
533 static void __kmp_fini_allocator() {}
534 
535 /* ------------------------------------------------------------------------ */
536 
537 #ifdef KMP_DYNAMIC_LIB
538 # if KMP_OS_WINDOWS
539 
540 static void
541 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
542  // TODO: Change to __kmp_break_bootstrap_lock().
543  __kmp_init_bootstrap_lock( lck ); // make the lock released
544 }
545 
546 static void
547 __kmp_reset_locks_on_process_detach( int gtid_req ) {
548  int i;
549  int thread_count;
550 
551  // PROCESS_DETACH is expected to be called by a thread
552  // that executes ProcessExit() or FreeLibrary().
553  // The OS terminates other threads (except the one calling ProcessExit or FreeLibrary),
554  // so it might be safe to access __kmp_threads[] without taking the forkjoin_lock.
555  // However, in fact, some threads can still be alive here, although they are about to be terminated.
556  // The threads in the array with ds_thread==0 are most suspicious.
557  // Actually, it may not be safe to access __kmp_threads[] at all.
558 
559  // TODO: does it make sense to check __kmp_roots[] ?
560 
561  // Let's check that there are no other live threads registered with the OpenMP library.
562  while( 1 ) {
563  thread_count = 0;
564  for( i = 0; i < __kmp_threads_capacity; ++i ) {
565  if( !__kmp_threads ) continue;
566  kmp_info_t* th = __kmp_threads[ i ];
567  if( th == NULL ) continue;
568  int gtid = th->th.th_info.ds.ds_gtid;
569  if( gtid == gtid_req ) continue;
570  if( gtid < 0 ) continue;
571  DWORD exit_val;
572  int alive = __kmp_is_thread_alive( th, &exit_val );
573  if( alive ) {
574  ++thread_count;
575  }
576  }
577  if( thread_count == 0 ) break; // success
578  }
579 
580  // Assume that I'm alone.
581 
582  // Now it is probably safe to check and reset the locks.
583  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
584  __kmp_reset_lock( &__kmp_forkjoin_lock );
585  #ifdef KMP_DEBUG
586  __kmp_reset_lock( &__kmp_stdio_lock );
587  #endif // KMP_DEBUG
588 }
589 
590 BOOL WINAPI
591 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
592  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
593 
594  switch( fdwReason ) {
595 
596  case DLL_PROCESS_ATTACH:
597  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
598 
599  return TRUE;
600 
601  case DLL_PROCESS_DETACH:
602  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
603  __kmp_gtid_get_specific() ));
604 
605  if( lpReserved != NULL )
606  {
607  // lpReserved is used to tell the difference:
608  // lpReserved == NULL when FreeLibrary() was called,
609  // lpReserved != NULL when the process terminates.
610  // When FreeLibrary() is called, worker threads remain alive.
611  // So they will release the forkjoin lock by themselves.
612  // When the process terminates, worker threads disappear, triggering
613  // the problem of an unreleased forkjoin lock, as described below.
614 
615  // A worker thread can take the forkjoin lock.
616  // The problem comes up if that worker thread dies
617  // before it releases the forkjoin lock.
618  // The forkjoin lock remains taken, while the thread
619  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
620  // will try to take the forkjoin lock and will always fail,
621  // so that the application will never finish [normally].
622  // This scenario is possible if __kmpc_end() has not been executed.
623  // This is not just a corner case; it happens in common cases:
624  // - the main function was compiled by an alternative compiler;
625  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
626  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
627  // - a live foreign thread prevented __kmpc_end() from doing cleanup.
628 
629  // This is a hack to work around the problem.
630  // TODO: !!! to figure out something better.
631  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
632  }
633 
634  __kmp_internal_end_library( __kmp_gtid_get_specific() );
635 
636  return TRUE;
637 
638  case DLL_THREAD_ATTACH:
639  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
640 
641  /* if we wanted to register new siblings all the time here call
642  * __kmp_get_gtid(); */
643  return TRUE;
644 
645  case DLL_THREAD_DETACH:
646  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
647  __kmp_gtid_get_specific() ));
648 
649  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
650  return TRUE;
651  }
652 
653  return TRUE;
654 }
655 
656 # endif /* KMP_OS_WINDOWS */
657 #endif /* KMP_DYNAMIC_LIB */
658 
659 
660 /* ------------------------------------------------------------------------ */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int
665 __kmp_change_library( int status )
666 {
667  int old_status;
668 
669  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
670 
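 /* The low bit of __kmp_yield_init encodes the library mode: an even initial
 yield count means KMP_LIBRARY=throughput, an odd count means turnaround.
 Setting or clearing bit 0 below switches modes without touching the rest of
 the counter. */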
671  if (status) {
672  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
673  }
674  else {
675  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
676  }
677 
678  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
679 }
680 
681 /* ------------------------------------------------------------------------ */
682 /* ------------------------------------------------------------------------ */
683 
684 /* __kmp_parallel_deo --
685  * Wait until it's our turn.
686  */
687 void
688 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
689 {
690  int gtid = *gtid_ref;
691 #ifdef BUILD_PARALLEL_ORDERED
692  kmp_team_t *team = __kmp_team_from_gtid( gtid );
693 #endif /* BUILD_PARALLEL_ORDERED */
694 
695  if( __kmp_env_consistency_check ) {
696  if( __kmp_threads[gtid]->th.th_root->r.r_active )
697 #if KMP_USE_DYNAMIC_LOCK
698  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
699 #else
700  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
701 #endif
702  }
703 #ifdef BUILD_PARALLEL_ORDERED
704  if( !team->t.t_serialized ) {
705  KMP_MB();
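 /* t_ordered.dt.t_value acts as a ticket: each thread waits until the value
 equals its own tid, and __kmp_parallel_dxo below advances it to
 (tid + 1) % t_nproc, so ordered sections execute in thread-id order. */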
706  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
707  KMP_MB();
708  }
709 #endif /* BUILD_PARALLEL_ORDERED */
710 }
711 
712 /* __kmp_parallel_dxo --
713  * Signal the next task.
714  */
715 
716 void
717 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
718 {
719  int gtid = *gtid_ref;
720 #ifdef BUILD_PARALLEL_ORDERED
721  int tid = __kmp_tid_from_gtid( gtid );
722  kmp_team_t *team = __kmp_team_from_gtid( gtid );
723 #endif /* BUILD_PARALLEL_ORDERED */
724 
725  if( __kmp_env_consistency_check ) {
726  if( __kmp_threads[gtid]->th.th_root->r.r_active )
727  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
728  }
729 #ifdef BUILD_PARALLEL_ORDERED
730  if ( ! team->t.t_serialized ) {
731  KMP_MB(); /* Flush all pending memory write invalidates. */
732 
733  /* use the tid of the next thread in this team */
734  /* TODO: replace with a general release procedure */
735  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
736 
737 #if OMPT_SUPPORT && OMPT_BLAME
738  if (ompt_enabled &&
739  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
740  /* accept blame for "ordered" waiting */
741  kmp_info_t *this_thread = __kmp_threads[gtid];
742  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
743  this_thread->th.ompt_thread_info.wait_id);
744  }
745 #endif
746 
747  KMP_MB(); /* Flush all pending memory write invalidates. */
748  }
749 #endif /* BUILD_PARALLEL_ORDERED */
750 }
751 
752 /* ------------------------------------------------------------------------ */
753 /* ------------------------------------------------------------------------ */
754 
755 /* ------------------------------------------------------------------------ */
756 /* ------------------------------------------------------------------------ */
757 
758 /* The BARRIER for a SINGLE process section is always explicit */
759 
760 int
761 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
762 {
763  int status;
764  kmp_info_t *th;
765  kmp_team_t *team;
766 
767  if( ! TCR_4(__kmp_init_parallel) )
768  __kmp_parallel_initialize();
769 
770  th = __kmp_threads[ gtid ];
771  team = th->th.th_team;
772  status = 0;
773 
774  th->th.th_ident = id_ref;
775 
776  if ( team->t.t_serialized ) {
777  status = 1;
778  } else {
779  kmp_int32 old_this = th->th.th_local.this_construct;
780 
781  ++th->th.th_local.this_construct;
782  /* try to set team count to thread count--success means thread got the
783  single block
784  */
785  /* TODO: Should this be acquire or release? */
786  if (team->t.t_construct == old_this) {
787  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
788  th->th.th_local.this_construct);
789  }
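 /* Each thread bumps its private this_construct counter; only the thread whose
 compare-and-swap advances team->t.t_construct from old_this "wins" the single
 block and gets a non-zero status. */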
790 #if USE_ITT_BUILD
791  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
792 #if OMP_40_ENABLED
793  th->th.th_teams_microtask == NULL &&
794 #endif
795  team->t.t_active_level == 1 )
796  { // Only report metadata by master of active team at level 1
797  __kmp_itt_metadata_single( id_ref );
798  }
799 #endif /* USE_ITT_BUILD */
800  }
801 
802  if( __kmp_env_consistency_check ) {
803  if (status && push_ws) {
804  __kmp_push_workshare( gtid, ct_psingle, id_ref );
805  } else {
806  __kmp_check_workshare( gtid, ct_psingle, id_ref );
807  }
808  }
809 #if USE_ITT_BUILD
810  if ( status ) {
811  __kmp_itt_single_start( gtid );
812  }
813 #endif /* USE_ITT_BUILD */
814  return status;
815 }
816 
817 void
818 __kmp_exit_single( int gtid )
819 {
820 #if USE_ITT_BUILD
821  __kmp_itt_single_end( gtid );
822 #endif /* USE_ITT_BUILD */
823  if( __kmp_env_consistency_check )
824  __kmp_pop_workshare( gtid, ct_psingle, NULL );
825 }
826 
827 
828 /*
829  * Determine if we can go parallel or must use a serialized parallel region, and
830  * how many threads we can use.
831  * set_nthreads is the number of threads requested for the team.
832  * Returns 1 if we should serialize or only use one thread,
833  * otherwise the number of threads to use.
834  * The forkjoin lock is held by the caller.
835  */
836 static int
837 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
838  int master_tid, int set_nthreads
839 #if OMP_40_ENABLED
840  , int enter_teams
841 #endif /* OMP_40_ENABLED */
842 )
843 {
844  int capacity;
845  int new_nthreads;
846  KMP_DEBUG_ASSERT( __kmp_init_serial );
847  KMP_DEBUG_ASSERT( root && parent_team );
848 
849  //
850  // If dyn-var is set, dynamically adjust the number of desired threads,
851  // according to the method specified by dynamic_mode.
852  //
853  new_nthreads = set_nthreads;
854  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
855  ;
856  }
857 #ifdef USE_LOAD_BALANCE
858  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
859  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
860  if ( new_nthreads == 1 ) {
861  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
862  master_tid ));
863  return 1;
864  }
865  if ( new_nthreads < set_nthreads ) {
866  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
867  master_tid, new_nthreads ));
868  }
869  }
870 #endif /* USE_LOAD_BALANCE */
871  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
872  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
873  : root->r.r_hot_team->t.t_nproc);
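 /* Limit the new team to the processors not already in use
 (__kmp_avail_proc - __kmp_nth), adding back the master thread (or the existing
 hot team), whose threads are counted in __kmp_nth but will be reused by the
 new team. */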
874  if ( new_nthreads <= 1 ) {
875  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
876  master_tid ));
877  return 1;
878  }
879  if ( new_nthreads < set_nthreads ) {
880  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
881  master_tid, new_nthreads ));
882  }
883  else {
884  new_nthreads = set_nthreads;
885  }
886  }
887  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
888  if ( set_nthreads > 2 ) {
889  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
890  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
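 /* Fold the random value into the range [1, set_nthreads]; e.g. with
 set_nthreads == 4, a raw value of 10 gives 10 % 4 + 1 == 3 threads. */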
891  if ( new_nthreads == 1 ) {
892  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
893  master_tid ));
894  return 1;
895  }
896  if ( new_nthreads < set_nthreads ) {
897  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
898  master_tid, new_nthreads ));
899  }
900  }
901  }
902  else {
903  KMP_ASSERT( 0 );
904  }
905 
906  //
907  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
908  //
909  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
910  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
911  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
912  root->r.r_hot_team->t.t_nproc );
913  if ( tl_nthreads <= 0 ) {
914  tl_nthreads = 1;
915  }
916 
917  //
918  // If dyn-var is false, emit a 1-time warning.
919  //
920  if ( ! get__dynamic_2( parent_team, master_tid )
921  && ( ! __kmp_reserve_warn ) ) {
922  __kmp_reserve_warn = 1;
923  __kmp_msg(
924  kmp_ms_warning,
925  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
926  KMP_HNT( Unset_ALL_THREADS ),
927  __kmp_msg_null
928  );
929  }
930  if ( tl_nthreads == 1 ) {
931  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
932  master_tid ));
933  return 1;
934  }
935  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
936  master_tid, tl_nthreads ));
937  new_nthreads = tl_nthreads;
938  }
939 
940  //
941  // Check if the threads array is large enough, or needs expanding.
942  //
943  // See comment in __kmp_register_root() about the adjustment if
944  // __kmp_threads[0] == NULL.
945  //
946  capacity = __kmp_threads_capacity;
947  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
948  --capacity;
949  }
950  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
951  root->r.r_hot_team->t.t_nproc ) > capacity ) {
952  //
953  // Expand the threads array.
954  //
955  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
956  root->r.r_hot_team->t.t_nproc ) - capacity;
957  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
958  if ( slotsAdded < slotsRequired ) {
959  //
960  // The threads array was not expanded enough.
961  //
962  new_nthreads -= ( slotsRequired - slotsAdded );
963  KMP_ASSERT( new_nthreads >= 1 );
964 
965  //
966  // If dyn-var is false, emit a 1-time warning.
967  //
968  if ( ! get__dynamic_2( parent_team, master_tid )
969  && ( ! __kmp_reserve_warn ) ) {
970  __kmp_reserve_warn = 1;
971  if ( __kmp_tp_cached ) {
972  __kmp_msg(
973  kmp_ms_warning,
974  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
975  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
976  KMP_HNT( PossibleSystemLimitOnThreads ),
977  __kmp_msg_null
978  );
979  }
980  else {
981  __kmp_msg(
982  kmp_ms_warning,
983  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
984  KMP_HNT( SystemLimitOnThreads ),
985  __kmp_msg_null
986  );
987  }
988  }
989  }
990  }
991 
992  if ( new_nthreads == 1 ) {
993  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
994  __kmp_get_gtid(), set_nthreads ) );
995  return 1;
996  }
997 
998  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
999  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1000  return new_nthreads;
1001 }
1002 
1003 /* ------------------------------------------------------------------------ */
1004 /* ------------------------------------------------------------------------ */
1005 
1006 /* allocate threads from the thread pool and assign them to the new team */
1007 /* we are assured that there are enough threads available, because we
1008  * checked that earlier within the forkjoin critical section */
1009 
1010 static void
1011 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1012  kmp_info_t *master_th, int master_gtid )
1013 {
1014  int i;
1015  int use_hot_team;
1016 
1017  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1018  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1019  KMP_MB();
1020 
1021  /* first, let's setup the master thread */
1022  master_th->th.th_info.ds.ds_tid = 0;
1023  master_th->th.th_team = team;
1024  master_th->th.th_team_nproc = team->t.t_nproc;
1025  master_th->th.th_team_master = master_th;
1026  master_th->th.th_team_serialized = FALSE;
1027  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1028 
1029  /* make sure we are not the optimized hot team */
1030 #if KMP_NESTED_HOT_TEAMS
1031  use_hot_team = 0;
1032  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1033  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1034  int level = team->t.t_active_level - 1; // index in array of hot teams
1035  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1036  if( master_th->th.th_teams_size.nteams > 1 ) {
1037  ++level; // level was not increased in teams construct for team_of_masters
1038  }
1039  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1040  master_th->th.th_teams_level == team->t.t_level ) {
1041  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1042  } // team->t.t_level will be increased inside parallel
1043  }
1044  if( level < __kmp_hot_teams_max_level ) {
1045  if( hot_teams[level].hot_team ) {
1046  // hot team has already been allocated for given level
1047  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1048  use_hot_team = 1; // the team is ready to use
1049  } else {
1050  use_hot_team = 0; // AC: threads are not allocated yet
1051  hot_teams[level].hot_team = team; // remember new hot team
1052  hot_teams[level].hot_team_nth = team->t.t_nproc;
1053  }
1054  } else {
1055  use_hot_team = 0;
1056  }
1057  }
1058 #else
1059  use_hot_team = team == root->r.r_hot_team;
1060 #endif
1061  if ( !use_hot_team ) {
1062 
1063  /* install the master thread */
1064  team->t.t_threads[ 0 ] = master_th;
1065  __kmp_initialize_info( master_th, team, 0, master_gtid );
1066 
1067  /* now, install the worker threads */
1068  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1069 
1070  /* fork or reallocate a new thread and install it in team */
1071  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1072  team->t.t_threads[ i ] = thr;
1073  KMP_DEBUG_ASSERT( thr );
1074  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1075  /* align team and thread arrived states */
1076  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1077  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1078  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1079  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1080  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1081 #if OMP_40_ENABLED
1082  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1083  thr->th.th_teams_level = master_th->th.th_teams_level;
1084  thr->th.th_teams_size = master_th->th.th_teams_size;
1085 #endif
1086  { // Initialize threads' barrier data.
1087  int b;
1088  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1089  for ( b = 0; b < bs_last_barrier; ++ b ) {
1090  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1091  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1092 #if USE_DEBUGGER
1093  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1094 #endif
1095  }; // for b
1096  }
1097  }
1098 
1099 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1100  __kmp_partition_places( team );
1101 #endif
1102 
1103  }
1104 
1105  KMP_MB();
1106 }
1107 
1108 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1109 //
1110 // Propagate any changes to the floating point control registers out to the team
1111 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1112 // so we don't make changes unless they are needed.
1113 //
1114 inline static void
1115 propagateFPControl(kmp_team_t * team)
1116 {
1117  if ( __kmp_inherit_fp_control ) {
1118  kmp_int16 x87_fpu_control_word;
1119  kmp_uint32 mxcsr;
1120 
1121  // Get master values of FPU control flags (both X87 and vector)
1122  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1123  __kmp_store_mxcsr( &mxcsr );
1124  mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126  // There is no point looking at t_fp_control_saved here.
1127  // If it is TRUE, we still have to update the values if they are different from those we now have.
1128  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1129  // that the values in the team are the same as those we have.
1130  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1131  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1132  // cache-line into a written state, causing all threads in the team to have to read it again.
1133  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1134  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1135  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1136  // So we must ensure it is correct.
1137  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1138  }
1139  else {
1140  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1141  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1142  }
1143 }
1144 
1145 // Do the opposite, setting the hardware registers to the updated values from the team.
1146 inline static void
1147 updateHWFPControl(kmp_team_t * team)
1148 {
1149  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1150  //
1151  // Only reset the fp control regs if they have been changed in the team
1152  // by the parallel region that we are exiting.
1153  //
1154  kmp_int16 x87_fpu_control_word;
1155  kmp_uint32 mxcsr;
1156  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1157  __kmp_store_mxcsr( &mxcsr );
1158  mxcsr &= KMP_X86_MXCSR_MASK;
1159 
1160  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1161  __kmp_clear_x87_fpu_status_word();
1162  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1163  }
1164 
1165  if ( team->t.t_mxcsr != mxcsr ) {
1166  __kmp_load_mxcsr( &team->t.t_mxcsr );
1167  }
1168  }
1169 }
1170 #else
1171 # define propagateFPControl(x) ((void)0)
1172 # define updateHWFPControl(x) ((void)0)
1173 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1174 
1175 static void
1176 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1177 
1178 /*
1179  * Run a parallel region that has been serialized, so it runs only in a team of the single master thread.
1180  */
1181 void
1182 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1183 {
1184  kmp_info_t *this_thr;
1185  kmp_team_t *serial_team;
1186 
1187  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1188 
1189  /* Skip all this code for autopar serialized loops since it results in
1190  unacceptable overhead */
1191  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1192  return;
1193 
1194  if( ! TCR_4( __kmp_init_parallel ) )
1195  __kmp_parallel_initialize();
1196 
1197  this_thr = __kmp_threads[ global_tid ];
1198  serial_team = this_thr->th.th_serial_team;
1199 
1200  /* utilize the serialized team held by this thread */
1201  KMP_DEBUG_ASSERT( serial_team );
1202  KMP_MB();
1203 
1204  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1205  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1206  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1207  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1208  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1209  this_thr->th.th_task_team = NULL;
1210  }
1211 
1212 #if OMP_40_ENABLED
1213  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1214  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1215  proc_bind = proc_bind_false;
1216  }
1217  else if ( proc_bind == proc_bind_default ) {
1218  //
1219  // No proc_bind clause was specified, so use the current value
1220  // of proc-bind-var for this parallel region.
1221  //
1222  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1223  }
1224  //
1225  // Reset for next parallel region
1226  //
1227  this_thr->th.th_set_proc_bind = proc_bind_default;
1228 #endif /* OMP_40_ENABLED */
1229 
1230  if( this_thr->th.th_team != serial_team ) {
1231  // Nested level will be an index in the nested nthreads array
1232  int level = this_thr->th.th_team->t.t_level;
1233 
1234  if( serial_team->t.t_serialized ) {
1235  /* this serial team was already used
1236  * TODO: increase performance by making these locks more specific */
1237  kmp_team_t *new_team;
1238 
1239  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1240 
1241 #if OMPT_SUPPORT
1242  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1243 #endif
1244 
1245  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247  ompt_parallel_id,
1248 #endif
1249 #if OMP_40_ENABLED
1250  proc_bind,
1251 #endif
1252  & this_thr->th.th_current_task->td_icvs,
1253  0 USE_NESTED_HOT_ARG(NULL) );
1254  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1255  KMP_ASSERT( new_team );
1256 
1257  /* setup new serialized team and install it */
1258  new_team->t.t_threads[0] = this_thr;
1259  new_team->t.t_parent = this_thr->th.th_team;
1260  serial_team = new_team;
1261  this_thr->th.th_serial_team = serial_team;
1262 
1263  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1264  global_tid, serial_team ) );
1265 
1266 
1267  /* TODO the above breaks the requirement that if we run out of
1268  * resources, then we can still guarantee that serialized teams
1269  * are ok, since we may need to allocate a new one */
1270  } else {
1271  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1272  global_tid, serial_team ) );
1273  }
1274 
1275  /* we have to initialize this serial team */
1276  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1277  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1278  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1279  serial_team->t.t_ident = loc;
1280  serial_team->t.t_serialized = 1;
1281  serial_team->t.t_nproc = 1;
1282  serial_team->t.t_parent = this_thr->th.th_team;
1283  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1284  this_thr->th.th_team = serial_team;
1285  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1286 
1287  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1288  global_tid, this_thr->th.th_current_task ) );
1289  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1290  this_thr->th.th_current_task->td_flags.executing = 0;
1291 
1292  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1293 
1294  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1295  each serialized task represented by team->t.t_serialized? */
1296  copy_icvs(
1297  & this_thr->th.th_current_task->td_icvs,
1298  & this_thr->th.th_current_task->td_parent->td_icvs );
1299 
1300  // Thread value exists in the nested nthreads array for the next nested level
1301  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1302  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1303  }
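 /* __kmp_nested_nth holds per-nesting-level thread counts (e.g. from a
 comma-separated OMP_NUM_THREADS list); when an entry exists for level + 1 it
 overrides the nproc ICV for this nested serialized region. */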
1304 
1305 #if OMP_40_ENABLED
1306  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1307  this_thr->th.th_current_task->td_icvs.proc_bind
1308  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1309  }
1310 #endif /* OMP_40_ENABLED */
1311 
1312 #if USE_DEBUGGER
1313  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1314 #endif
1315  this_thr->th.th_info.ds.ds_tid = 0;
1316 
1317  /* set thread cache values */
1318  this_thr->th.th_team_nproc = 1;
1319  this_thr->th.th_team_master = this_thr;
1320  this_thr->th.th_team_serialized = 1;
1321 
1322  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1323  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1324 
1325  propagateFPControl (serial_team);
1326 
1327  /* check if we need to allocate dispatch buffers stack */
1328  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1329  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1330  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1331  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1332  }
1333  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334 
1335 #if OMPT_SUPPORT
1336  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1337  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1338 #endif
1339 
1340  KMP_MB();
1341 
1342  } else {
1343  /* this serialized team is already being used,
1344  * that's fine, just add another nested level */
1345  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1346  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1347  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1348  ++ serial_team->t.t_serialized;
1349  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1350 
1351  // Nested level will be an index in the nested nthreads array
1352  int level = this_thr->th.th_team->t.t_level;
1353  // Thread value exists in the nested nthreads array for the next nested level
1354  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1355  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1356  }
1357  serial_team->t.t_level++;
1358  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1359  global_tid, serial_team, serial_team->t.t_level ) );
1360 
1361  /* allocate/push dispatch buffers stack */
1362  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1363  {
1364  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1365  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1366  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1367  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1368  }
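 /* The per-level dispatch buffers form a singly linked stack: each additional
 nesting level pushes a freshly allocated buffer whose 'next' points at the
 previous level's buffer. */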
1369  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1370 
1371  KMP_MB();
1372  }
1373 
1374  if ( __kmp_env_consistency_check )
1375  __kmp_push_parallel( global_tid, NULL );
1376 
1377 }
1378 
1379 /* most of the work for a fork */
1380 /* return true if we really went parallel, false if serialized */
1381 int
1382 __kmp_fork_call(
1383  ident_t * loc,
1384  int gtid,
1385  enum fork_context_e call_context, // Intel, GNU, ...
1386  kmp_int32 argc,
1387 #if OMPT_SUPPORT
1388  void *unwrapped_task,
1389 #endif
1390  microtask_t microtask,
1391  launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394  va_list * ap
1395 #else
1396  va_list ap
1397 #endif
1398  )
1399 {
1400  void **argv;
1401  int i;
1402  int master_tid;
1403  int master_this_cons;
1404  kmp_team_t *team;
1405  kmp_team_t *parent_team;
1406  kmp_info_t *master_th;
1407  kmp_root_t *root;
1408  int nthreads;
1409  int master_active;
1410  int master_set_numthreads;
1411  int level;
1412 #if OMP_40_ENABLED
1413  int active_level;
1414  int teams_level;
1415 #endif
1416 #if KMP_NESTED_HOT_TEAMS
1417  kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419  { // KMP_TIME_BLOCK
1420  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1424  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1425  /* Some systems prefer the stack for the root thread(s) to start with */
1426  /* some gap from the parent stack to prevent false sharing. */
1427  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428  /* These 2 lines below are so this does not get optimized out */
1429  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1430  __kmp_stkpadding += (short)((kmp_int64)dummy);
1431  }
1432 
1433  /* initialize if needed */
1434  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1435  if( ! TCR_4(__kmp_init_parallel) )
1436  __kmp_parallel_initialize();
1437 
1438  /* setup current data */
1439  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1440  parent_team = master_th->th.th_team;
1441  master_tid = master_th->th.th_info.ds.ds_tid;
1442  master_this_cons = master_th->th.th_local.this_construct;
1443  root = master_th->th.th_root;
1444  master_active = root->r.r_active;
1445  master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448  ompt_parallel_id_t ompt_parallel_id;
1449  ompt_task_id_t ompt_task_id;
1450  ompt_frame_t *ompt_frame;
1451  ompt_task_id_t my_task_id;
1452  ompt_parallel_id_t my_parallel_id;
1453 
1454  if (ompt_enabled) {
1455  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1456  ompt_task_id = __ompt_get_task_id_internal(0);
1457  ompt_frame = __ompt_get_task_frame_internal(0);
1458  }
1459 #endif
1460 
1461  // Nested level will be an index in the nested nthreads array
1462  level = parent_team->t.t_level;
1463  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1464 #if OMP_40_ENABLED
1465  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1466 #endif
1467 #if KMP_NESTED_HOT_TEAMS
1468  p_hot_teams = &master_th->th.th_hot_teams;
1469  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1470  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1471  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1474  }
1475 #endif
1476 
1477 #if OMPT_SUPPORT
1478  if (ompt_enabled &&
1479  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1480  int team_size = master_set_numthreads;
1481 
1482  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1483  ompt_task_id, ompt_frame, ompt_parallel_id,
1484  team_size, unwrapped_task, OMPT_INVOKER(call_context));
1485  }
1486 #endif
1487 
1488  master_th->th.th_ident = loc;
1489 
1490 #if OMP_40_ENABLED
1491  if ( master_th->th.th_teams_microtask &&
1492  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1493  // AC: This is start of parallel that is nested inside teams construct.
1494  // The team is actual (hot), all workers are ready at the fork barrier.
1495  // No lock needed to initialize the team a bit, then free workers.
1496  parent_team->t.t_ident = loc;
1497  __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1498  parent_team->t.t_argc = argc;
1499  argv = (void**)parent_team->t.t_argv;
1500  for( i=argc-1; i >= 0; --i )
1501 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1502 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1503  *argv++ = va_arg( *ap, void * );
1504 #else
1505  *argv++ = va_arg( ap, void * );
1506 #endif
1507  /* Increment our nested depth levels, but do not increase the serialization */
1508  if ( parent_team == master_th->th.th_serial_team ) {
1509  // AC: we are in serialized parallel
1510  __kmpc_serialized_parallel(loc, gtid);
1511  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1512  parent_team->t.t_serialized--; // AC: need this so that enquiry functions
1513  // work correctly; will restore at join time
1514 
1515 #if OMPT_SUPPORT
1516  void *dummy;
1517  void **exit_runtime_p;
1518 
1519  ompt_lw_taskteam_t lw_taskteam;
1520 
1521  if (ompt_enabled) {
1522  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1523  unwrapped_task, ompt_parallel_id);
1524  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1525  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1526 
1527  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1528 
1529 #if OMPT_TRACE
1530  /* OMPT implicit task begin */
1531  my_task_id = lw_taskteam.ompt_task_info.task_id;
1532  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1533  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1534  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1535  my_parallel_id, my_task_id);
1536  }
1537 #endif
1538 
1539  /* OMPT state */
1540  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1541  } else {
1542  exit_runtime_p = &dummy;
1543  }
1544 #endif
1545 
1546  {
1547  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1548  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1549  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1550 #if OMPT_SUPPORT
1551  , exit_runtime_p
1552 #endif
1553  );
1554  }
1555 
1556 #if OMPT_SUPPORT
1557  *exit_runtime_p = NULL;
1558  if (ompt_enabled) {
1559 #if OMPT_TRACE
1560  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1561 
1562  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1563  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1564  ompt_parallel_id, ompt_task_id);
1565  }
1566 
1567  __ompt_lw_taskteam_unlink(master_th);
1568  // clear the task id only after unlinking the task
1569  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1570 #endif
1571 
1572  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1573  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1574  ompt_parallel_id, ompt_task_id,
1575  OMPT_INVOKER(call_context));
1576  }
1577  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1578  }
1579 #endif
1580  return TRUE;
1581  }
1582 
1583  parent_team->t.t_pkfn = microtask;
1584 #if OMPT_SUPPORT
1585  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1586 #endif
1587  parent_team->t.t_invoke = invoker;
1588  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1589  parent_team->t.t_active_level ++;
1590  parent_team->t.t_level ++;
1591 
1592  /* Change number of threads in the team if requested */
1593  if ( master_set_numthreads ) { // The parallel has num_threads clause
1594  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1595  // AC: can only reduce the number of threads dynamically, cannot increase it
1596  kmp_info_t **other_threads = parent_team->t.t_threads;
1597  parent_team->t.t_nproc = master_set_numthreads;
1598  for ( i = 0; i < master_set_numthreads; ++i ) {
1599  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1600  }
1601  // Keep the extra threads hot in the team for possible subsequent parallel regions
1602  }
1603  master_th->th.th_set_nproc = 0;
1604  }
1605 
1606 #if USE_DEBUGGER
1607  if ( __kmp_debugging ) { // Let debugger override number of threads.
1608  int nth = __kmp_omp_num_threads( loc );
1609  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1610  master_set_numthreads = nth;
1611  }; // if
1612  }; // if
1613 #endif
1614 
1615  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1616  __kmp_internal_fork( loc, gtid, parent_team );
1617  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1618 
1619  /* Invoke microtask for MASTER thread */
1620  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1621  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1622 
1623  {
1624  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1625  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1626  if (! parent_team->t.t_invoke( gtid )) {
1627  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1628  }
1629  }
1630  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1631  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1632  KMP_MB(); /* Flush all pending memory write invalidates. */
1633 
1634  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1635 
1636  return TRUE;
1637  } // Parallel closely nested in teams construct
1638 #endif /* OMP_40_ENABLED */
1639 
1640 #if KMP_DEBUG
1641  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1642  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1643  }
1644 #endif
1645 
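// Decide the thread count for this region: serialize if the max-active-levels ICV has already been
// reached; otherwise use the num_threads clause value (master_set_numthreads) if present, else the
// current nproc setting.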
1646  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1647  nthreads = 1;
1648  } else {
1649 #if OMP_40_ENABLED
1650  int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1651 #endif
1652  nthreads = master_set_numthreads ?
1653  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1654 
1655  // Check whether we need to take the forkjoin lock (no need for a serialized parallel outside of a teams construct).
1656  // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallels.
1657  if (nthreads > 1) {
1658  if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1659 #if OMP_40_ENABLED
1660  && !enter_teams
1661 #endif /* OMP_40_ENABLED */
1662  ) ) || ( __kmp_library == library_serial ) ) {
1663  KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1664  gtid, nthreads ));
1665  nthreads = 1;
1666  }
1667  }
1668  if ( nthreads > 1 ) {
1669  /* determine how many new threads we can use */
1670  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1671 
1672  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1673 #if OMP_40_ENABLED
1674 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
1675  but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
1676  then the teams and their threads should be created regardless of the nesting setting. */
1677  , enter_teams
1678 #endif /* OMP_40_ENABLED */
1679  );
1680  if ( nthreads == 1 ) {
1681  // Free the lock for single-threaded execution here;
1682  // for multi-threaded execution it will be freed later,
1683  // after the team of threads has been created and initialized
1684  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1685  }
1686  }
1687  }
1688  KMP_DEBUG_ASSERT( nthreads > 0 );
1689 
1690  /* If we temporarily changed the set number of threads then restore it now */
1691  master_th->th.th_set_nproc = 0;
1692 
1693  /* create a serialized parallel region? */
1694  if ( nthreads == 1 ) {
1695  /* josh todo: hypothetical question: what do we do for OS X*? */
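// The serialized path stack-allocates the microtask argument array: a C99 variable-length array on
// the Linux targets listed below, KMP_ALLOCA elsewhere.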
1696 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1697  void * args[ argc ];
1698 #else
1699  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1700 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1701 
1702  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1703 
1704  __kmpc_serialized_parallel(loc, gtid);
1705 
1706  if ( call_context == fork_context_intel ) {
1707  /* TODO this sucks, use the compiler itself to pass args! :) */
1708  master_th->th.th_serial_team->t.t_ident = loc;
1709 #if OMP_40_ENABLED
1710  if ( !ap ) {
1711  // revert change made in __kmpc_serialized_parallel()
1712  master_th->th.th_serial_team->t.t_level--;
1713  // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716  void *dummy;
1717  void **exit_runtime_p;
1718 
1719  ompt_lw_taskteam_t lw_taskteam;
1720 
1721  if (ompt_enabled) {
1722  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1723  unwrapped_task, ompt_parallel_id);
1724  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1725  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1726 
1727  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1728 
1729 #if OMPT_TRACE
1730  my_task_id = lw_taskteam.ompt_task_info.task_id;
1731  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1732  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1733  ompt_parallel_id, my_task_id);
1734  }
1735 #endif
1736 
1737  /* OMPT state */
1738  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1739  } else {
1740  exit_runtime_p = &dummy;
1741  }
1742 #endif
1743 
1744  {
1745  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1746  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1747  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1748 #if OMPT_SUPPORT
1749  , exit_runtime_p
1750 #endif
1751  );
1752  }
1753 
1754 #if OMPT_SUPPORT
1755  *exit_runtime_p = NULL;
1756  if (ompt_enabled) {
1757  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1758 
1759 #if OMPT_TRACE
1760  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1761  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1762  ompt_parallel_id, ompt_task_id);
1763  }
1764 #endif
1765 
1766  __ompt_lw_taskteam_unlink(master_th);
1767  // clear the task id only after unlinking the task
1768  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1769 
1770  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1771  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1772  ompt_parallel_id, ompt_task_id,
1773  OMPT_INVOKER(call_context));
1774  }
1775  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1776  }
1777 #endif
1778  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1779  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1780  team = master_th->th.th_team;
1781  //team->t.t_pkfn = microtask;
1782  team->t.t_invoke = invoker;
1783  __kmp_alloc_argv_entries( argc, team, TRUE );
1784  team->t.t_argc = argc;
1785  argv = (void**) team->t.t_argv;
1786  if ( ap ) {
1787  for( i=argc-1; i >= 0; --i )
1788 // TODO: revert workaround for Intel(R) 64 tracker #96
1789 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1790  *argv++ = va_arg( *ap, void * );
1791 # else
1792  *argv++ = va_arg( ap, void * );
1793 # endif
1794  } else {
1795  for( i=0; i < argc; ++i )
1796  // Get args from parent team for teams construct
1797  argv[i] = parent_team->t.t_argv[i];
1798  }
1799  // AC: revert change made in __kmpc_serialized_parallel()
1800  // because initial code in teams should have level=0
1801  team->t.t_level--;
1802  // AC: call special invoker for outer "parallel" of the teams construct
1803  {
1804  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1805  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1806  invoker(gtid);
1807  }
1808  } else {
1809 #endif /* OMP_40_ENABLED */
1810  argv = args;
1811  for( i=argc-1; i >= 0; --i )
1812 // TODO: revert workaround for Intel(R) 64 tracker #96
1813 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1814  *argv++ = va_arg( *ap, void * );
1815 #else
1816  *argv++ = va_arg( ap, void * );
1817 #endif
1818  KMP_MB();
1819 
1820 #if OMPT_SUPPORT
1821  void *dummy;
1822  void **exit_runtime_p;
1823 
1824  ompt_lw_taskteam_t lw_taskteam;
1825 
1826  if (ompt_enabled) {
1827  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1828  unwrapped_task, ompt_parallel_id);
1829  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1830  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1831 
1832  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1833 
1834 #if OMPT_TRACE
1835  /* OMPT implicit task begin */
1836  my_task_id = lw_taskteam.ompt_task_info.task_id;
1837  my_parallel_id = ompt_parallel_id;
1838  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1839  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1840  my_parallel_id, my_task_id);
1841  }
1842 #endif
1843 
1844  /* OMPT state */
1845  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1846  } else {
1847  exit_runtime_p = &dummy;
1848  }
1849 #endif
1850 
1851  {
1852  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1853  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1854  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1855 #if OMPT_SUPPORT
1856  , exit_runtime_p
1857 #endif
1858  );
1859  }
1860 
1861 #if OMPT_SUPPORT
1862  *exit_runtime_p = NULL;
1863  if (ompt_enabled) {
1864 #if OMPT_TRACE
1865  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1866 
1867  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1868  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1869  my_parallel_id, my_task_id);
1870  }
1871 #endif
1872 
1873  __ompt_lw_taskteam_unlink(master_th);
1874  // clear the task id only after unlinking the task
1875  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1876 
1877  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1878  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1879  ompt_parallel_id, ompt_task_id,
1880  OMPT_INVOKER(call_context));
1881  }
1882  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1883  }
1884 #endif
1885 #if OMP_40_ENABLED
1886  }
1887 #endif /* OMP_40_ENABLED */
1888  }
1889  else if ( call_context == fork_context_gnu ) {
1890 #if OMPT_SUPPORT
1891  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1892  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1893  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1894  unwrapped_task, ompt_parallel_id);
1895 
1896  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1897  lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1898  __ompt_lw_taskteam_link(lwt, master_th);
1899 #endif
1900 
1901  // we were called from GNU native code
1902  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1903  return FALSE;
1904  }
1905  else {
1906  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1907  }
1908 
1909 
1910  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1911  KMP_MB();
1912  return FALSE;
1913  }
1914 
1915  // GEH: only modify the executing flag in the case when not serialized;
1916  // the serialized case is handled in __kmpc_serialized_parallel
1917  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1918  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1919  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1920  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1921  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1922  master_th->th.th_current_task->td_flags.executing = 0;
1923 
1924 #if OMP_40_ENABLED
1925  if ( !master_th->th.th_teams_microtask || level > teams_level )
1926 #endif /* OMP_40_ENABLED */
1927  {
1928  /* Increment our nested depth level */
1929  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1930  }
1931 
1932  // See if we need to make a copy of the ICVs.
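// nthreads_icv is left nonzero only when the nested nthreads list (__kmp_nested_nth) supplies a
// different value for the next nesting level; zero means the nproc ICV will not be updated.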
1933  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1935  nthreads_icv = __kmp_nested_nth.nth[level+1];
1936  }
1937  else {
1938  nthreads_icv = 0; // don't update
1939  }
1940 
1941 #if OMP_40_ENABLED
1942  // Figure out the proc_bind_policy for the new team.
1943  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1944  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1945  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1946  proc_bind = proc_bind_false;
1947  }
1948  else {
1949  if (proc_bind == proc_bind_default) {
1950  // No proc_bind clause specified; use current proc-bind-var for this parallel region
1951  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952  }
1953  /* else: The proc_bind policy was specified explicitly on the parallel clause. This
1954  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1955  // Figure the value of proc-bind-var for the child threads.
1956  if ((level+1 < __kmp_nested_proc_bind.used)
1957  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1958  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1959  }
1960  }
1961 
1962  // Reset for next parallel region
1963  master_th->th.th_set_proc_bind = proc_bind_default;
1964 #endif /* OMP_40_ENABLED */
1965 
1966  if ((nthreads_icv > 0)
1967 #if OMP_40_ENABLED
1968  || (proc_bind_icv != proc_bind_default)
1969 #endif /* OMP_40_ENABLED */
1970  ) {
1971  kmp_internal_control_t new_icvs;
1972  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1973  new_icvs.next = NULL;
1974  if (nthreads_icv > 0) {
1975  new_icvs.nproc = nthreads_icv;
1976  }
1977 
1978 #if OMP_40_ENABLED
1979  if (proc_bind_icv != proc_bind_default) {
1980  new_icvs.proc_bind = proc_bind_icv;
1981  }
1982 #endif /* OMP_40_ENABLED */
1983 
1984  /* allocate a new parallel team */
1985  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1986  team = __kmp_allocate_team(root, nthreads, nthreads,
1987 #if OMPT_SUPPORT
1988  ompt_parallel_id,
1989 #endif
1990 #if OMP_40_ENABLED
1991  proc_bind,
1992 #endif
1993  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1994  } else {
1995  /* allocate a new parallel team */
1996  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1997  team = __kmp_allocate_team(root, nthreads, nthreads,
1998 #if OMPT_SUPPORT
1999  ompt_parallel_id,
2000 #endif
2001 #if OMP_40_ENABLED
2002  proc_bind,
2003 #endif
2004  &master_th->th.th_current_task->td_icvs, argc
2005  USE_NESTED_HOT_ARG(master_th) );
2006  }
2007  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2008 
2009  /* setup the new team */
2010  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2011  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2012  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2013  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2014  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2015 #if OMPT_SUPPORT
2016  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2017 #endif
2018  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); /* TODO move this to root, maybe */
2019  // TODO: parent_team->t.t_level == INT_MAX ???
2020 #if OMP_40_ENABLED
2021  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2022 #endif /* OMP_40_ENABLED */
2023  int new_level = parent_team->t.t_level + 1;
2024  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2025  new_level = parent_team->t.t_active_level + 1;
2026  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2027 #if OMP_40_ENABLED
2028  } else {
2029  // AC: Do not increase parallel level at start of the teams construct
2030  int new_level = parent_team->t.t_level;
2031  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2032  new_level = parent_team->t.t_active_level;
2033  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2034  }
2035 #endif /* OMP_40_ENABLED */
2036  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2037  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2038  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2039 
2040 #if OMP_40_ENABLED
2041  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2042 #endif
2043 
2044  // Update the floating point rounding in the team if required.
2045  propagateFPControl(team);
2046 
2047  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2048  // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2049 #if 0
2050  // Patch out an assertion that trips while the runtime seems to operate correctly.
2051  // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2052  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2053 #endif
2054  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2055  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2056  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2057 
2058  if ( active_level || master_th->th.th_task_team ) {
2059  // Take a memo of master's task_state
2060  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
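// Grow the task_state memo stack geometrically (2x) when it is full: copy the old entries,
// zero-fill the remainder, then free the old stack.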
2061  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2062  kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2063  kmp_uint8 *old_stack, *new_stack;
2064  kmp_uint32 i;
2065  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2066  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2067  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2068  }
2069  for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2070  new_stack[i] = 0;
2071  }
2072  old_stack = master_th->th.th_task_state_memo_stack;
2073  master_th->th.th_task_state_memo_stack = new_stack;
2074  master_th->th.th_task_state_stack_sz = new_size;
2075  __kmp_free(old_stack);
2076  }
2077  // Store master's task_state on stack
2078  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2079  master_th->th.th_task_state_top++;
2080 #if KMP_NESTED_HOT_TEAMS
2081  if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team
2082  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2083  }
2084  else {
2085 #endif
2086  master_th->th.th_task_state = 0;
2087 #if KMP_NESTED_HOT_TEAMS
2088  }
2089 #endif
2090  }
2091 #if !KMP_NESTED_HOT_TEAMS
2092  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2093 #endif
2094  }
2095 
2096  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2097  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2098  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2099  ( team->t.t_master_tid == 0 &&
2100  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2101  KMP_MB();
2102 
2103  /* now, setup the arguments */
2104  argv = (void**)team->t.t_argv;
2105 #if OMP_40_ENABLED
2106  if ( ap ) {
2107 #endif /* OMP_40_ENABLED */
2108  for ( i=argc-1; i >= 0; --i ) {
2109 // TODO: revert workaround for Intel(R) 64 tracker #96
2110 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2111  void *new_argv = va_arg(*ap, void *);
2112 #else
2113  void *new_argv = va_arg(ap, void *);
2114 #endif
2115  KMP_CHECK_UPDATE(*argv, new_argv);
2116  argv++;
2117  }
2118 #if OMP_40_ENABLED
2119  } else {
2120  for ( i=0; i < argc; ++i ) {
2121  // Get args from parent team for teams construct
2122  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2123  }
2124  }
2125 #endif /* OMP_40_ENABLED */
2126 
2127  /* now actually fork the threads */
2128  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2129  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2130  root->r.r_active = TRUE;
2131 
2132  __kmp_fork_team_threads( root, team, master_th, gtid );
2133  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2134 
2135 #if OMPT_SUPPORT
2136  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2137 #endif
2138 
2139  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2140 
2141 #if USE_ITT_BUILD
2142  if ( team->t.t_active_level == 1 // only report frames at level 1
2143 # if OMP_40_ENABLED
2144  && !master_th->th.th_teams_microtask // not in teams construct
2145 # endif /* OMP_40_ENABLED */
2146  ) {
2147 #if USE_ITT_NOTIFY
2148  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2149  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2150  {
2151  kmp_uint64 tmp_time = 0;
2152  if ( __itt_get_timestamp_ptr )
2153  tmp_time = __itt_get_timestamp();
2154  // Internal fork - report frame begin
2155  master_th->th.th_frame_time = tmp_time;
2156  if ( __kmp_forkjoin_frames_mode == 3 )
2157  team->t.t_region_time = tmp_time;
2158  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2159 #endif /* USE_ITT_NOTIFY */
2160  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2161  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2162  { // Mark start of "parallel" region for VTune.
2163  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2164  }
2165  }
2166 #endif /* USE_ITT_BUILD */
2167 
2168  /* now go on and do the work */
2169  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2170  KMP_MB();
2171  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2172  root, team, master_th, gtid));
2173 
2174 #if USE_ITT_BUILD
2175  if ( __itt_stack_caller_create_ptr ) {
2176  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2177  }
2178 #endif /* USE_ITT_BUILD */
2179 
2180 #if OMP_40_ENABLED
2181  if ( ap ) // AC: skip __kmp_internal_fork for the teams construct; let only the master threads execute
2182 #endif /* OMP_40_ENABLED */
2183  {
2184  __kmp_internal_fork( loc, gtid, team );
2185  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2186  root, team, master_th, gtid));
2187  }
2188 
2189  if (call_context == fork_context_gnu) {
2190  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2191  return TRUE;
2192  }
2193 
2194  /* Invoke microtask for MASTER thread */
2195  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2196  gtid, team->t.t_id, team->t.t_pkfn ) );
2197  } // END of timer KMP_fork_call block
2198 
2199  {
2200  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2201  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2202  if (! team->t.t_invoke( gtid )) {
2203  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2204  }
2205  }
2206  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2207  gtid, team->t.t_id, team->t.t_pkfn ) );
2208  KMP_MB(); /* Flush all pending memory write invalidates. */
2209 
2210  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2211 
2212 #if OMPT_SUPPORT
2213  if (ompt_enabled) {
2214  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2215  }
2216 #endif
2217 
2218  return TRUE;
2219 }
2220 
2221 #if OMPT_SUPPORT
2222 static inline void
2223 __kmp_join_restore_state(
2224  kmp_info_t *thread,
2225  kmp_team_t *team)
2226 {
2227  // restore state outside the region
2228  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2229  ompt_state_work_serial : ompt_state_work_parallel);
2230 }
2231 
2232 static inline void
2233 __kmp_join_ompt(
2234  kmp_info_t *thread,
2235  kmp_team_t *team,
2236  ompt_parallel_id_t parallel_id,
2237  fork_context_e fork_context)
2238 {
2239  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2240  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2241  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2242  parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2243  }
2244 
2245  task_info->frame.reenter_runtime_frame = NULL;
2246  __kmp_join_restore_state(thread,team);
2247 }
2248 #endif
2249 
2250 void
2251 __kmp_join_call(ident_t *loc, int gtid
2252 #if OMPT_SUPPORT
2253  , enum fork_context_e fork_context
2254 #endif
2255 #if OMP_40_ENABLED
2256  , int exit_teams
2257 #endif /* OMP_40_ENABLED */
2258 )
2259 {
2260  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2261  kmp_team_t *team;
2262  kmp_team_t *parent_team;
2263  kmp_info_t *master_th;
2264  kmp_root_t *root;
2265  int master_active;
2266  int i;
2267 
2268  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2269 
2270  /* setup current data */
2271  master_th = __kmp_threads[ gtid ];
2272  root = master_th->th.th_root;
2273  team = master_th->th.th_team;
2274  parent_team = team->t.t_parent;
2275 
2276  master_th->th.th_ident = loc;
2277 
2278 #if OMPT_SUPPORT
2279  if (ompt_enabled) {
2280  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2281  }
2282 #endif
2283 
2284 #if KMP_DEBUG
2285  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2286  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2287  __kmp_gtid_from_thread( master_th ), team,
2288  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2289  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2290  }
2291 #endif
2292 
2293  if( team->t.t_serialized ) {
2294 #if OMP_40_ENABLED
2295  if ( master_th->th.th_teams_microtask ) {
2296  // We are in teams construct
2297  int level = team->t.t_level;
2298  int tlevel = master_th->th.th_teams_level;
2299  if ( level == tlevel ) {
2300  // AC: we haven't incremented it earlier at start of teams construct,
2301  // so do it here - at the end of teams construct
2302  team->t.t_level++;
2303  } else if ( level == tlevel + 1 ) {
2304  // AC: we are exiting parallel inside teams, need to increment serialization
2305  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2306  team->t.t_serialized++;
2307  }
2308  }
2309 #endif /* OMP_40_ENABLED */
2310  __kmpc_end_serialized_parallel( loc, gtid );
2311 
2312 #if OMPT_SUPPORT
2313  if (ompt_enabled) {
2314  __kmp_join_restore_state(master_th, parent_team);
2315  }
2316 #endif
2317 
2318  return;
2319  }
2320 
2321  master_active = team->t.t_master_active;
2322 
2323 #if OMP_40_ENABLED
2324  if (!exit_teams)
2325 #endif /* OMP_40_ENABLED */
2326  {
2327  // AC: No barrier for internal teams at exit from the teams construct,
2328  // but there is a barrier for the external team (league).
2329  __kmp_internal_join( loc, gtid, team );
2330  }
2331 #if OMP_40_ENABLED
2332  else {
2333  master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2334  }
2335 #endif /* OMP_40_ENABLED */
2336 
2337  KMP_MB();
2338 
2339 #if OMPT_SUPPORT
2340  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2341 #endif
2342 
2343 #if USE_ITT_BUILD
2344  if ( __itt_stack_caller_create_ptr ) {
2345  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2346  }
2347 
2348  // Mark end of "parallel" region for VTune.
2349  if ( team->t.t_active_level == 1
2350 # if OMP_40_ENABLED
2351  && !master_th->th.th_teams_microtask /* not in teams construct */
2352 # endif /* OMP_40_ENABLED */
2353  ) {
2354  master_th->th.th_ident = loc;
2355  // only one notification scheme (either "submit" or "forking/joined", not both)
2356  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2357  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2358  0, loc, master_th->th.th_team_nproc, 1 );
2359  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2360  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2361  __kmp_itt_region_joined( gtid );
2362  } // active_level == 1
2363 #endif /* USE_ITT_BUILD */
2364 
2365 #if OMP_40_ENABLED
2366  if ( master_th->th.th_teams_microtask &&
2367  !exit_teams &&
2368  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2369  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2370  // AC: We need to leave the team structure intact at the end
2371  // of parallel inside the teams construct, so that at the next
2372  // parallel same (hot) team works, only adjust nesting levels
2373 
2374  /* Decrement our nested depth level */
2375  team->t.t_level --;
2376  team->t.t_active_level --;
2377  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2378 
2379  /* Restore number of threads in the team if needed */
2380  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2381  int old_num = master_th->th.th_team_nproc;
2382  int new_num = master_th->th.th_teams_size.nth;
2383  kmp_info_t **other_threads = team->t.t_threads;
2384  team->t.t_nproc = new_num;
2385  for ( i = 0; i < old_num; ++i ) {
2386  other_threads[i]->th.th_team_nproc = new_num;
2387  }
2388  // Adjust the state of the unused threads of the team
2389  for ( i = old_num; i < new_num; ++i ) {
2390  // Re-initialize thread's barrier data.
2391  int b;
2392  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2393  for ( b = 0; b < bs_last_barrier; ++ b ) {
2394  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2395  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2396 #if USE_DEBUGGER
2397  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2398 #endif
2399  }
2400  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2401  // Synchronize thread's task state
2402  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2403  }
2404  }
2405  }
2406 
2407 #if OMPT_SUPPORT
2408  if (ompt_enabled) {
2409  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2410  }
2411 #endif
2412 
2413  return;
2414  }
2415 #endif /* OMP_40_ENABLED */
2416 
2417  /* do cleanup and restore the parent team */
2418  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2419  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2420 
2421  master_th->th.th_dispatch =
2422  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2423 
2424  /* jc: The following lock has instructions with REL and ACQ semantics,
2425  separating the parallel user code called in this parallel region
2426  from the serial user code called after this function returns.
2427  */
2428  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2429 
2430 #if OMP_40_ENABLED
2431  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2432 #endif /* OMP_40_ENABLED */
2433  {
2434  /* Decrement our nested depth level */
2435  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2436  }
2437  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2438 
2439 #if OMPT_SUPPORT && OMPT_TRACE
2440  if(ompt_enabled){
2441  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2442  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2443  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2444  parallel_id, task_info->task_id);
2445  }
2446  task_info->frame.exit_runtime_frame = NULL;
2447  task_info->task_id = 0;
2448  }
2449 #endif
2450 
2451  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2452  0, master_th, team ) );
2453  __kmp_pop_current_task_from_thread( master_th );
2454 
2455 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2456  //
2457  // Restore master thread's partition.
2458  //
2459  master_th->th.th_first_place = team->t.t_first_place;
2460  master_th->th.th_last_place = team->t.t_last_place;
2461 #endif /* OMP_40_ENABLED */
2462 
2463  updateHWFPControl (team);
2464 
2465  if ( root->r.r_active != master_active )
2466  root->r.r_active = master_active;
2467 
2468  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2469 
2470  /* This race was fun to find. Make sure the following is in the critical
2471  * region; otherwise assertions may fail occasionally since the old team
2472  * may be reallocated and the hierarchy appears inconsistent. It is
2473  * actually safe to run and won't cause any bugs, but will cause those
2474  * assertion failures. It's only one dereference & assignment, so we might
2475  * as well put it in the critical region. */
2476  master_th->th.th_team = parent_team;
2477  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2478  master_th->th.th_team_master = parent_team->t.t_threads[0];
2479  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2480 
2481  /* restore serialized team, if need be */
2482  if( parent_team->t.t_serialized &&
2483  parent_team != master_th->th.th_serial_team &&
2484  parent_team != root->r.r_root_team ) {
2485  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2486  master_th->th.th_serial_team = parent_team;
2487  }
2488 
2489  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2490  if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2491  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2492  // Remember master's state if we re-use this nested hot team
2493  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2494  --master_th->th.th_task_state_top; // pop
2495  // Now restore state at this level
2496  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2497  }
2498  // Copy the task team from the parent team to the master thread
2499  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2500  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2501  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2502  }
2503 
2504  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2505  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2506  master_th->th.th_current_task->td_flags.executing = 1;
2507 
2508  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2509 
2510 #if OMPT_SUPPORT
2511  if (ompt_enabled) {
2512  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2513  }
2514 #endif
2515 
2516  KMP_MB();
2517  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2518 }
2519 
2520 /* ------------------------------------------------------------------------ */
2521 /* ------------------------------------------------------------------------ */
2522 
2523 /* Check whether we should push an internal control record onto the
2524  serial team stack. If so, do it. */
2525 void
2526 __kmp_save_internal_controls ( kmp_info_t * thread )
2527 {
2528 
2529  if ( thread->th.th_team != thread->th.th_serial_team ) {
2530  return;
2531  }
2532  if (thread->th.th_team->t.t_serialized > 1) {
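// Push at most one control record per serialization depth: push only if the stack is empty or its
// top record was saved at a different serial nesting level.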
2533  int push = 0;
2534 
2535  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2536  push = 1;
2537  } else {
2538  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2539  thread->th.th_team->t.t_serialized ) {
2540  push = 1;
2541  }
2542  }
2543  if (push) { /* push a record on the serial team's stack */
2544  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2545 
2546  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2547 
2548  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2549 
2550  control->next = thread->th.th_team->t.t_control_stack_top;
2551  thread->th.th_team->t.t_control_stack_top = control;
2552  }
2553  }
2554 }
2555 
2556 /* Changes set_nproc */
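// Typically reached via omp_set_num_threads(); new_nth is clamped to [1, __kmp_max_nth] before the
// nproc ICV is updated.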
2557 void
2558 __kmp_set_num_threads( int new_nth, int gtid )
2559 {
2560  kmp_info_t *thread;
2561  kmp_root_t *root;
2562 
2563  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2564  KMP_DEBUG_ASSERT( __kmp_init_serial );
2565 
2566  if (new_nth < 1)
2567  new_nth = 1;
2568  else if (new_nth > __kmp_max_nth)
2569  new_nth = __kmp_max_nth;
2570 
2571  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2572  thread = __kmp_threads[gtid];
2573 
2574  __kmp_save_internal_controls( thread );
2575 
2576  set__nproc( thread, new_nth );
2577 
2578  //
2579  // If this omp_set_num_threads() call will cause the hot team size to be
2580  // reduced (in the absence of a num_threads clause), then reduce it now,
2581  // rather than waiting for the next parallel region.
2582  //
2583  root = thread->th.th_root;
2584  if ( __kmp_init_parallel && ( ! root->r.r_active )
2585  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2586 #if KMP_NESTED_HOT_TEAMS
2587  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2588 #endif
2589  ) {
2590  kmp_team_t *hot_team = root->r.r_hot_team;
2591  int f;
2592 
2593  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2594 
2595  // Release the extra threads we don't need any more.
2596  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2597  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2598  if ( __kmp_tasking_mode != tskm_immediate_exec) {
2599  // When decreasing team size, threads no longer in the team should unref task team.
2600  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2601  }
2602  __kmp_free_thread( hot_team->t.t_threads[f] );
2603  hot_team->t.t_threads[f] = NULL;
2604  }
2605  hot_team->t.t_nproc = new_nth;
2606 #if KMP_NESTED_HOT_TEAMS
2607  if( thread->th.th_hot_teams ) {
2608  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2609  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2610  }
2611 #endif
2612 
2613  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2614 
2615  //
2616  // Update the t_nproc field in the threads that are still active.
2617  //
2618  for( f=0 ; f < new_nth; f++ ) {
2619  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2620  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2621  }
2622  // Special flag to indicate that the team size was changed by an omp_set_num_threads() call
2623  hot_team->t.t_size_changed = -1;
2624  }
2625 }
2626 
2627 /* Changes max_active_levels */
2628 void
2629 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2630 {
2631  kmp_info_t *thread;
2632 
2633  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2634  KMP_DEBUG_ASSERT( __kmp_init_serial );
2635 
2636  // validate max_active_levels
2637  if( max_active_levels < 0 ) {
2638  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2639  // We ignore this call if the user has specified a negative value.
2640  // The current setting won't be changed. The last valid setting will be used.
2641  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2642  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2643  return;
2644  }
2645  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2646  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2647  // We allow a zero value. (implementation defined behavior)
2648  } else {
2649  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2650  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2651  // The current upper limit is MAX_INT. (implementation defined behavior)
2652  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2653  // Actually, the flow should never get here as long as the limit is MAX_INT.
2654  }
2655  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2656 
2657  thread = __kmp_threads[ gtid ];
2658 
2659  __kmp_save_internal_controls( thread );
2660 
2661  set__max_active_levels( thread, max_active_levels );
2662 
2663 }
2664 
2665 /* Gets max_active_levels */
2666 int
2667 __kmp_get_max_active_levels( int gtid )
2668 {
2669  kmp_info_t *thread;
2670 
2671  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2672  KMP_DEBUG_ASSERT( __kmp_init_serial );
2673 
2674  thread = __kmp_threads[ gtid ];
2675  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2676  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2677  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2678  return thread->th.th_current_task->td_icvs.max_active_levels;
2679 }
2680 
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682 void
2683 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2684 {
2685  kmp_info_t *thread;
2686 // kmp_team_t *team;
2687 
2688  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2689  KMP_DEBUG_ASSERT( __kmp_init_serial );
2690 
2691  // Check if the kind parameter is valid, correct if needed.
2692  // Valid parameters should fit in one of two intervals - standard or extended:
2693  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2695  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2696  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2697  {
2698  // TODO: Hint needs attention in case we change the default schedule.
2699  __kmp_msg(
2700  kmp_ms_warning,
2701  KMP_MSG( ScheduleKindOutOfRange, kind ),
2702  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2703  __kmp_msg_null
2704  );
2705  kind = kmp_sched_default;
2706  chunk = 0; // ignore chunk value in case of bad kind
2707  }
2708 
2709  thread = __kmp_threads[ gtid ];
2710 
2711  __kmp_save_internal_controls( thread );
2712 
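// Map the user-visible kmp_sched_t kind onto an internal sched_type via __kmp_sch_map; the standard
// kinds (1 - 4) and the extended kinds (101 - 102) index disjoint ranges of that table.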
2713  if ( kind < kmp_sched_upper_std ) {
2714  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2715  // differentiate static chunked vs. unchunked:
2716  // chunk should be invalid to indicate an unchunked schedule (which is the default)
2717  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2718  } else {
2719  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2720  }
2721  } else {
2722  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2723  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2724  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2725  }
2726  if ( kind == kmp_sched_auto ) {
2727  // ignore parameter chunk for schedule auto
2728  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2729  } else {
2730  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2731  }
2732 }
2733 
2734 /* Gets def_sched_var ICV values */
2735 void
2736 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2737 {
2738  kmp_info_t *thread;
2739  enum sched_type th_type;
2740 
2741  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2742  KMP_DEBUG_ASSERT( __kmp_init_serial );
2743 
2744  thread = __kmp_threads[ gtid ];
2745 
2746  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2747 
2748  switch ( th_type ) {
2749  case kmp_sch_static:
2750  case kmp_sch_static_greedy:
2751  case kmp_sch_static_balanced:
2752  *kind = kmp_sched_static;
2753  *chunk = 0; // chunk was not set; indicate this with a zero value
2754  return;
2755  case kmp_sch_static_chunked:
2756  *kind = kmp_sched_static;
2757  break;
2758  case kmp_sch_dynamic_chunked:
2759  *kind = kmp_sched_dynamic;
2760  break;
2761  case kmp_sch_guided_chunked:
2762  case kmp_sch_guided_iterative_chunked:
2763  case kmp_sch_guided_analytical_chunked:
2764  *kind = kmp_sched_guided;
2765  break;
2766  case kmp_sch_auto:
2767  *kind = kmp_sched_auto;
2768  break;
2769  case kmp_sch_trapezoidal:
2770  *kind = kmp_sched_trapezoidal;
2771  break;
2772 #if KMP_STATIC_STEAL_ENABLED
2773  case kmp_sch_static_steal:
2774  *kind = kmp_sched_static_steal;
2775  break;
2776 #endif
2777  default:
2778  KMP_FATAL( UnknownSchedulingType, th_type );
2779  }
2780 
2781  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2782 }
2783 
2784 int
2785 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2786 
2787  int ii, dd;
2788  kmp_team_t *team;
2789  kmp_info_t *thr;
2790 
2791  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2792  KMP_DEBUG_ASSERT( __kmp_init_serial );
2793 
2794  // validate level
2795  if( level == 0 ) return 0;
2796  if( level < 0 ) return -1;
2797  thr = __kmp_threads[ gtid ];
2798  team = thr->th.th_team;
2799  ii = team->t.t_level;
2800  if( level > ii ) return -1;
2801 
2802 #if OMP_40_ENABLED
2803  if( thr->th.th_teams_microtask ) {
2804  // AC: we are in a teams region where multiple nested teams have the same level
2805  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2806  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2807  KMP_DEBUG_ASSERT( ii >= tlevel );
2808  // AC: as we need to pass through the teams league, we artificially increase ii
2809  if ( ii == tlevel ) {
2810  ii += 2; // three teams have same level
2811  } else {
2812  ii ++; // two teams have same level
2813  }
2814  }
2815  }
2816 #endif
2817 
2818  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2819 
2820  dd = team->t.t_serialized;
2821  level++;
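// Walk up the team hierarchy toward the requested level, first consuming the serialized nesting
// levels recorded in t_serialized, then stepping to the parent team.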
2822  while( ii > level )
2823  {
2824  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2825  {
2826  }
2827  if( ( team->t.t_serialized ) && ( !dd ) ) {
2828  team = team->t.t_parent;
2829  continue;
2830  }
2831  if( ii > level ) {
2832  team = team->t.t_parent;
2833  dd = team->t.t_serialized;
2834  ii--;
2835  }
2836  }
2837 
2838  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2839 }
2840 
2841 int
2842 __kmp_get_team_size( int gtid, int level ) {
2843 
2844  int ii, dd;
2845  kmp_team_t *team;
2846  kmp_info_t *thr;
2847 
2848  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2849  KMP_DEBUG_ASSERT( __kmp_init_serial );
2850 
2851  // validate level
2852  if( level == 0 ) return 1;
2853  if( level < 0 ) return -1;
2854  thr = __kmp_threads[ gtid ];
2855  team = thr->th.th_team;
2856  ii = team->t.t_level;
2857  if( level > ii ) return -1;
2858 
2859 #if OMP_40_ENABLED
2860  if( thr->th.th_teams_microtask ) {
2861  // AC: we are in a teams region where multiple nested teams have the same level
2862  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2863  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2864  KMP_DEBUG_ASSERT( ii >= tlevel );
2865  // AC: as we need to pass through the teams league, we artificially increase ii
2866  if ( ii == tlevel ) {
2867  ii += 2; // three teams have same level
2868  } else {
2869  ii ++; // two teams have same level
2870  }
2871  }
2872  }
2873 #endif
2874 
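// Walk up the team hierarchy as in __kmp_get_ancestor_thread_num() until the requested level is
// reached, then report that team's size.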
2875  while( ii > level )
2876  {
2877  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2878  {
2879  }
2880  if( team->t.t_serialized && ( !dd ) ) {
2881  team = team->t.t_parent;
2882  continue;
2883  }
2884  if( ii > level ) {
2885  team = team->t.t_parent;
2886  ii--;
2887  }
2888  }
2889 
2890  return team->t.t_nproc;
2891 }
2892 
2893 kmp_r_sched_t
2894 __kmp_get_schedule_global() {
2895 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2896 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2897 
2898  kmp_r_sched_t r_sched;
2899 
2900  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2901  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2902  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2903  if ( __kmp_sched == kmp_sch_static ) {
2904  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2905  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2906  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2907  } else {
2908  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2909  }
2910 
2911  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2912  r_sched.chunk = KMP_DEFAULT_CHUNK;
2913  } else {
2914  r_sched.chunk = __kmp_chunk;
2915  }
2916 
2917  return r_sched;
2918 }
2919 
2920 /* ------------------------------------------------------------------------ */
2921 /* ------------------------------------------------------------------------ */
2922 
2923 
2924 /*
2925  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2926  * at least argc *t_argv entries for the requested team.
2927  */
2928 static void
2929 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2930 {
2931 
2932  KMP_DEBUG_ASSERT( team );
2933  if( !realloc || argc > team->t.t_max_argc ) {
2934 
2935  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2936  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2937  /* if previously allocated heap space for args, free them */
2938  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2939  __kmp_free( (void *) team->t.t_argv );
2940 
2941  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2942  /* use unused space in the cache line for arguments */
2943  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2944  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2945  team->t.t_id, team->t.t_max_argc ));
2946  team->t.t_argv = &team->t.t_inline_argv[0];
2947  if ( __kmp_storage_map ) {
2948  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2949  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2950  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2951  "team_%d.t_inline_argv",
2952  team->t.t_id );
2953  }
2954  } else {
2955  /* allocate space for arguments in the heap */
2956  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2957  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2958  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2959  team->t.t_id, team->t.t_max_argc ));
2960  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2961  if ( __kmp_storage_map ) {
2962  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2963  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2964  team->t.t_id );
2965  }
2966  }
2967  }
2968 }
2969 
2970 static void
2971 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2972 {
2973  int i;
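// A team limited to at most one thread needs only 2 dispatch buffers; larger teams use the
// configured __kmp_dispatch_num_buffers.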
2974  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2975  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2976  team->t.t_disp_buffer = (dispatch_shared_info_t*)
2977  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2978  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2979  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2980  team->t.t_max_nproc = max_nth;
2981 
2982  /* setup dispatch buffers */
2983  for(i = 0 ; i < num_disp_buff; ++i) {
2984  team->t.t_disp_buffer[i].buffer_index = i;
2985 #if OMP_45_ENABLED
2986  team->t.t_disp_buffer[i].doacross_buf_idx = i;
2987 #endif
2988  }
2989 }
2990 
2991 static void
2992 __kmp_free_team_arrays(kmp_team_t *team) {
2993  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2994  int i;
2995  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2996  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2997  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2998  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2999  }; // if
3000  }; // for
3001  __kmp_free(team->t.t_threads);
3002  __kmp_free(team->t.t_disp_buffer);
3003  __kmp_free(team->t.t_dispatch);
3004  __kmp_free(team->t.t_implicit_task_taskdata);
3005  team->t.t_threads = NULL;
3006  team->t.t_disp_buffer = NULL;
3007  team->t.t_dispatch = NULL;
3008  team->t.t_implicit_task_taskdata = 0;
3009 }
3010 
3011 static void
3012 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3013  kmp_info_t **oldThreads = team->t.t_threads;
3014 
3015  __kmp_free(team->t.t_disp_buffer);
3016  __kmp_free(team->t.t_dispatch);
3017  __kmp_free(team->t.t_implicit_task_taskdata);
3018  __kmp_allocate_team_arrays(team, max_nth);
3019 
3020  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3021 
3022  __kmp_free(oldThreads);
3023 }
3024 
3025 static kmp_internal_control_t
3026 __kmp_get_global_icvs( void ) {
3027 
3028  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3029 
3030 #if OMP_40_ENABLED
3031  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3032 #endif /* OMP_40_ENABLED */
3033 
3034  kmp_internal_control_t g_icvs = {
3035  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3036  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3037  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3038  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3039  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3040 #if KMP_USE_MONITOR
3041  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3042 #endif
3043  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3044  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3045  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3046  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3047 #if OMP_40_ENABLED
3048  __kmp_nested_proc_bind.bind_types[0],
3049  __kmp_default_device,
3050 #endif /* OMP_40_ENABLED */
3051  NULL //struct kmp_internal_control *next;
3052  };
3053 
3054  return g_icvs;
3055 }
3056 
3057 static kmp_internal_control_t
3058 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3059 
3060  kmp_internal_control_t gx_icvs;
3061  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3062  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3063  gx_icvs.next = NULL;
3064 
3065  return gx_icvs;
3066 }
3067 
3068 static void
3069 __kmp_initialize_root( kmp_root_t *root )
3070 {
3071  int f;
3072  kmp_team_t *root_team;
3073  kmp_team_t *hot_team;
3074  int hot_team_max_nth;
3075  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3076  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3077  KMP_DEBUG_ASSERT( root );
3078  KMP_ASSERT( ! root->r.r_begin );
3079 
3080  /* setup the root state structure */
3081  __kmp_init_lock( &root->r.r_begin_lock );
3082  root->r.r_begin = FALSE;
3083  root->r.r_active = FALSE;
3084  root->r.r_in_parallel = 0;
3085  root->r.r_blocktime = __kmp_dflt_blocktime;
3086  root->r.r_nested = __kmp_dflt_nested;
3087 
3088  /* setup the root team for this task */
3089  /* allocate the root team structure */
3090  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3091 
3092  root_team =
3093  __kmp_allocate_team(
3094  root,
3095  1, // new_nproc
3096  1, // max_nproc
3097 #if OMPT_SUPPORT
3098  0, // root parallel id
3099 #endif
3100 #if OMP_40_ENABLED
3101  __kmp_nested_proc_bind.bind_types[0],
3102 #endif
3103  &r_icvs,
3104  0 // argc
3105  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3106  );
3107 #if USE_DEBUGGER
3108  // Non-NULL value should be assigned to make the debugger display the root team.
3109  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3110 #endif
3111 
3112  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3113 
3114  root->r.r_root_team = root_team;
3115  root_team->t.t_control_stack_top = NULL;
3116 
3117  /* initialize root team */
3118  root_team->t.t_threads[0] = NULL;
3119  root_team->t.t_nproc = 1;
3120  root_team->t.t_serialized = 1;
3121  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3122  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3123  root_team->t.t_sched.chunk = r_sched.chunk;
3124  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3125  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3126 
3127  /* setup the hot team for this task */
3128  /* allocate the hot team structure */
3129  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3130 
3131  hot_team =
3132  __kmp_allocate_team(
3133  root,
3134  1, // new_nproc
3135  __kmp_dflt_team_nth_ub * 2, // max_nproc
3136 #if OMPT_SUPPORT
3137  0, // root parallel id
3138 #endif
3139 #if OMP_40_ENABLED
3140  __kmp_nested_proc_bind.bind_types[0],
3141 #endif
3142  &r_icvs,
3143  0 // argc
3144  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3145  );
3146  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3147 
3148  root->r.r_hot_team = hot_team;
3149  root_team->t.t_control_stack_top = NULL;
3150 
3151  /* first-time initialization */
3152  hot_team->t.t_parent = root_team;
3153 
3154  /* initialize hot team */
3155  hot_team_max_nth = hot_team->t.t_max_nproc;
3156  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3157  hot_team->t.t_threads[ f ] = NULL;
3158  }; // for
3159  hot_team->t.t_nproc = 1;
3160  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3161  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3162  hot_team->t.t_sched.chunk = r_sched.chunk;
3163  hot_team->t.t_size_changed = 0;
3164 }
3165 
3166 #ifdef KMP_DEBUG
3167 
3168 
3169 typedef struct kmp_team_list_item {
3170  kmp_team_p const * entry;
3171  struct kmp_team_list_item * next;
3172 } kmp_team_list_item_t;
3173 typedef kmp_team_list_item_t * kmp_team_list_t;
3174 
3175 
3176 static void
3177 __kmp_print_structure_team_accum( // Add team to list of teams.
3178  kmp_team_list_t list, // List of teams.
3179  kmp_team_p const * team // Team to add.
3180 ) {
3181 
3182  // List must terminate with item where both entry and next are NULL.
3183  // Team is added to the list only once.
3184  // List is sorted in ascending order by team id.
3185  // Team id is *not* a key.
3186 
3187  kmp_team_list_t l;
3188 
3189  KMP_DEBUG_ASSERT( list != NULL );
3190  if ( team == NULL ) {
3191  return;
3192  }; // if
3193 
3194  __kmp_print_structure_team_accum( list, team->t.t_parent );
3195  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3196 
3197  // Search list for the team.
3198  l = list;
3199  while ( l->next != NULL && l->entry != team ) {
3200  l = l->next;
3201  }; // while
3202  if ( l->next != NULL ) {
3203  return; // Team has been added before, exit.
3204  }; // if
3205 
3206  // Team is not found. Search list again for insertion point.
3207  l = list;
3208  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3209  l = l->next;
3210  }; // while
3211 
3212  // Insert team.
3213  {
3214  kmp_team_list_item_t * item =
3215  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3216  * item = * l;
3217  l->entry = team;
3218  l->next = item;
3219  }
3220 
3221 }
3222 
3223 static void
3224 __kmp_print_structure_team(
3225  char const * title,
3226  kmp_team_p const * team
3227 
3228 ) {
3229  __kmp_printf( "%s", title );
3230  if ( team != NULL ) {
3231  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3232  } else {
3233  __kmp_printf( " - (nil)\n" );
3234  }; // if
3235 }
3236 
3237 static void
3238 __kmp_print_structure_thread(
3239  char const * title,
3240  kmp_info_p const * thread
3241 
3242 ) {
3243  __kmp_printf( "%s", title );
3244  if ( thread != NULL ) {
3245  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3246  } else {
3247  __kmp_printf( " - (nil)\n" );
3248  }; // if
3249 }
3250 
3251 void
3252 __kmp_print_structure(
3253  void
3254 ) {
3255 
3256  kmp_team_list_t list;
3257 
3258  // Initialize list of teams.
3259  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3260  list->entry = NULL;
3261  list->next = NULL;
3262 
3263  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3264  {
3265  int gtid;
3266  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3267  __kmp_printf( "%2d", gtid );
3268  if ( __kmp_threads != NULL ) {
3269  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3270  }; // if
3271  if ( __kmp_root != NULL ) {
3272  __kmp_printf( " %p", __kmp_root[ gtid ] );
3273  }; // if
3274  __kmp_printf( "\n" );
3275  }; // for gtid
3276  }
3277 
3278  // Print out __kmp_threads array.
3279  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3280  if ( __kmp_threads != NULL ) {
3281  int gtid;
3282  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3283  kmp_info_t const * thread = __kmp_threads[ gtid ];
3284  if ( thread != NULL ) {
3285  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3286  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3287  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3288  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3289  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3290  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3291  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3292  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3293 #if OMP_40_ENABLED
3294  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3295 #endif
3296  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3297  __kmp_printf( "\n" );
3298  __kmp_print_structure_team_accum( list, thread->th.th_team );
3299  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3300  }; // if
3301  }; // for gtid
3302  } else {
3303  __kmp_printf( "Threads array is not allocated.\n" );
3304  }; // if
3305 
3306  // Print out __kmp_root array.
3307  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3308  if ( __kmp_root != NULL ) {
3309  int gtid;
3310  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3311  kmp_root_t const * root = __kmp_root[ gtid ];
3312  if ( root != NULL ) {
3313  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3314  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3315  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3316  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3317  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3318  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3319  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3320  __kmp_printf( "\n" );
3321  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3322  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3323  }; // if
3324  }; // for gtid
3325  } else {
3326  __kmp_printf( "Ubers array is not allocated.\n" );
3327  }; // if
3328 
3329  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3330  while ( list->next != NULL ) {
3331  kmp_team_p const * team = list->entry;
3332  int i;
3333  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3334  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3335  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3336  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3337  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3338  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3339  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3340  __kmp_printf( " Thread %2d: ", i );
3341  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3342  }; // for i
3343  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3344  __kmp_printf( "\n" );
3345  list = list->next;
3346  }; // while
3347 
3348  // Print out __kmp_thread_pool and __kmp_team_pool.
3349  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3350  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3351  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3352  __kmp_printf( "\n" );
3353 
3354  // Free team list.
3355  while ( list != NULL ) {
3356  kmp_team_list_item_t * item = list;
3357  list = list->next;
3358  KMP_INTERNAL_FREE( item );
3359  }; // while
3360 
3361 }
3362 
3363 #endif
3364 
3365 
3366 //---------------------------------------------------------------------------
3367 // Stuff for per-thread fast random number generator
3368 // Table of primes
3369 
3370 static const unsigned __kmp_primes[] = {
3371  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3372  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3373  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3374  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3375  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3376  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3377  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3378  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3379  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3380  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3381  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3382  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3383  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3384  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3385  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3386  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3387 };
3388 
3389 //---------------------------------------------------------------------------
3390 // __kmp_get_random: Get a random number using a linear congruential method.
3391 
3392 unsigned short
3393 __kmp_get_random( kmp_info_t * thread )
3394 {
3395  unsigned x = thread->th.th_x;
3396  unsigned short r = x>>16;
3397 
3398  thread->th.th_x = x*thread->th.th_a+1;
3399 
3400  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3401  thread->th.th_info.ds.ds_tid, r) );
3402 
3403  return r;
3404 }
3405 //--------------------------------------------------------
3406 // __kmp_init_random: Initialize a random number generator
3407 
3408 void
3409 __kmp_init_random( kmp_info_t * thread )
3410 {
3411  unsigned seed = thread->th.th_info.ds.ds_tid;
3412 
3413  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3414  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3415  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3416 }
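// The two routines above implement a simple multiplicative linear congruential
// generator: each thread gets its own multiplier 'a' from __kmp_primes[], the
// state advances as x = x*a + 1, and the upper 16 bits of the previous state
// are returned (the high bits of an LCG are better distributed than the low
// bits). The guarded block below is a minimal standalone sketch of the same
// scheme, kept here for illustration only -- the lcg_* names are not part of
// the runtime.
#if 0
#include <stdio.h>

static unsigned lcg_a; // per-generator multiplier (a prime from a table such as __kmp_primes[])
static unsigned lcg_x; // generator state

static void lcg_init( unsigned seed, unsigned prime ) {
    lcg_a = prime;                    // e.g. 0x9e3779b1
    lcg_x = ( seed + 1 ) * lcg_a + 1; // same seeding as __kmp_init_random()
}

static unsigned short lcg_next( void ) {
    unsigned x = lcg_x;
    lcg_x = x * lcg_a + 1;              // same recurrence as __kmp_get_random()
    return (unsigned short)( x >> 16 ); // return the better-distributed high bits
}

int main( void ) {
    lcg_init( 0, 0x9e3779b1 );
    for ( int i = 0; i < 4; ++i )
        printf( "%u\n", (unsigned)lcg_next() );
    return 0;
}
#endif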
3417 
3418 
3419 #if KMP_OS_WINDOWS
3420 /* Reclaim array entries for root threads that are already dead; returns the number reclaimed. */
3421 static int
3422 __kmp_reclaim_dead_roots(void) {
3423  int i, r = 0;
3424 
3425  for(i = 0; i < __kmp_threads_capacity; ++i) {
3426  if( KMP_UBER_GTID( i ) &&
3427  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3428  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3429  r += __kmp_unregister_root_other_thread(i);
3430  }
3431  }
3432  return r;
3433 }
3434 #endif
3435 
3436 /*
3437  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3438  free entries generated.
3439 
3440  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3441  already dead.
3442 
3443  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3444  updates to __kmp_threads_capacity. Array capacity is increased by doubling, with clipping to
3445  __kmp_tp_capacity if a threadprivate cache array has been created.
3446  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3447 
3448  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3449  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3450  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3451  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3452  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3453  as many free slots as possible up to nWish.
3454 
3455  If any argument is negative, the behavior is undefined.
3456 */
3457 static int
3458 __kmp_expand_threads(int nWish, int nNeed) {
3459  int added = 0;
3460  int old_tp_cached;
3461  int __kmp_actual_max_nth;
3462 
3463  if(nNeed > nWish) /* normalize the arguments */
3464  nWish = nNeed;
3465 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3466 /* only for Windows static library */
3467  /* reclaim array entries for root threads that are already dead */
3468  added = __kmp_reclaim_dead_roots();
3469 
3470  if(nNeed) {
3471  nNeed -= added;
3472  if(nNeed < 0)
3473  nNeed = 0;
3474  }
3475  if(nWish) {
3476  nWish -= added;
3477  if(nWish < 0)
3478  nWish = 0;
3479  }
3480 #endif
3481  if(nWish <= 0)
3482  return added;
3483 
3484  while(1) {
3485  int nTarget;
3486  int minimumRequiredCapacity;
3487  int newCapacity;
3488  kmp_info_t **newThreads;
3489  kmp_root_t **newRoot;
3490 
3491  //
3492  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3493  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3494  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3495  // become > __kmp_max_nth in one of two ways:
3496  //
3497  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3498  // may not be reused by another thread, so we may need to increase
3499  // __kmp_threads_capacity to __kmp_max_nth + 1.
3500  //
3501  // 2) New foreign root(s) are encountered. We always register new
3502  // foreign roots. This may cause a smaller # of threads to be
3503  // allocated at subsequent parallel regions, but the worker threads
3504  // hang around (and eventually go to sleep) and need slots in the
3505  // __kmp_threads[] array.
3506  //
3507  // Anyway, that is the reason for moving the check to see if
3508  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3509  // instead of having it performed here. -BB
3510  //
3511  old_tp_cached = __kmp_tp_cached;
3512  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3513  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3514 
3515  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3516  nTarget = nWish;
3517  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3518  /* can't fulfil nWish, so try nNeed */
3519  if(nNeed) {
3520  nTarget = nNeed;
3521  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3522  /* possible expansion too small -- give up */
3523  break;
3524  }
3525  } else {
3526  /* best-effort */
3527  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3528  if(!nTarget) {
3529  /* can't expand at all -- give up */
3530  break;
3531  }
3532  }
3533  }
3534  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3535 
3536  newCapacity = __kmp_threads_capacity;
3537  do{
3538  newCapacity =
3539  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3540  (newCapacity << 1) :
3541  __kmp_actual_max_nth;
3542  } while(newCapacity < minimumRequiredCapacity);
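 // Illustrative example (values chosen arbitrarily): with
 // __kmp_threads_capacity == 32, __kmp_actual_max_nth == 100 and
 // nTarget == 40, minimumRequiredCapacity is 72; the loop doubles
 // 32 -> 64, and since 64 > __kmp_actual_max_nth/2 the next step clips
 // to __kmp_actual_max_nth, so newCapacity ends up as 100 (>= 72).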
3543  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3544  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3545  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3546  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3547  memset(newThreads + __kmp_threads_capacity, 0,
3548  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3549  memset(newRoot + __kmp_threads_capacity, 0,
3550  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3551 
3552  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3553  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3554  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3555  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3556  of a double-check pair.
3557  */
3558  __kmp_free(newThreads);
3559  continue; /* start over and try again */
3560  }
3561  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3562  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3563  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3564  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3565  __kmp_free(newThreads);
3566  continue; /* start over and try again */
3567  } else {
3568  /* success */
3569  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3570  //
3571  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3572  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3573  added += newCapacity - __kmp_threads_capacity;
3574  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3575  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3576  break; /* succeeded, so we can exit the loop */
3577  }
3578  }
3579  return added;
3580 }
3581 
3582 /* register the current thread as a root thread and obtain our gtid */
3583 /* we must have the __kmp_initz_lock held at this point */
3584 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3585 int
3586 __kmp_register_root( int initial_thread )
3587 {
3588  kmp_info_t *root_thread;
3589  kmp_root_t *root;
3590  int gtid;
3591  int capacity;
3592  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3593  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3594  KMP_MB();
3595 
3596 
3597  /*
3598  2007-03-02:
3599 
3600  If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial one,
3601  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3602  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3603  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3604  used for this one. The following code works around this bug.
3605 
3606  However, the right solution seems to be not to reserve slot #0 for the initial thread, because:
3607  (1) there is no magic in slot #0,
3608  (2) we cannot detect the initial thread reliably (the first thread which does serial
3609  initialization may not be a real initial thread).
3610  */
3611  capacity = __kmp_threads_capacity;
3612  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3613  -- capacity;
3614  }; // if
3615 
3616  /* see if there are too many threads */
3617  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3618  if ( __kmp_tp_cached ) {
3619  __kmp_msg(
3620  kmp_ms_fatal,
3621  KMP_MSG( CantRegisterNewThread ),
3622  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3623  KMP_HNT( PossibleSystemLimitOnThreads ),
3624  __kmp_msg_null
3625  );
3626  }
3627  else {
3628  __kmp_msg(
3629  kmp_ms_fatal,
3630  KMP_MSG( CantRegisterNewThread ),
3631  KMP_HNT( SystemLimitOnThreads ),
3632  __kmp_msg_null
3633  );
3634  }
3635  }; // if
3636 
3637  /* find an available thread slot */
3638  /* Don't reassign the zero slot since we need that to only be used by initial
3639  thread */
3640  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3641  ;
3642  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3643  KMP_ASSERT( gtid < __kmp_threads_capacity );
3644 
3645  /* update global accounting */
3646  __kmp_all_nth ++;
3647  TCW_4(__kmp_nth, __kmp_nth + 1);
3648 
3649  //
3650  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3651  // for low numbers of procs, and method #2 (keyed API call) for higher
3652  // numbers of procs.
3653  //
3654  if ( __kmp_adjust_gtid_mode ) {
3655  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3656  if ( TCR_4(__kmp_gtid_mode) != 2) {
3657  TCW_4(__kmp_gtid_mode, 2);
3658  }
3659  }
3660  else {
3661  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3662  TCW_4(__kmp_gtid_mode, 1);
3663  }
3664  }
3665  }
3666 
3667 #ifdef KMP_ADJUST_BLOCKTIME
3668  /* Adjust blocktime to zero if necessary */
3669  /* Middle initialization might not have occurred yet */
3670  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3671  if ( __kmp_nth > __kmp_avail_proc ) {
3672  __kmp_zero_bt = TRUE;
3673  }
3674  }
3675 #endif /* KMP_ADJUST_BLOCKTIME */
3676 
3677  /* setup this new hierarchy */
3678  if( ! ( root = __kmp_root[gtid] )) {
3679  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3680  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3681  }
3682 
3683 #if KMP_STATS_ENABLED
3684  // Initialize stats as soon as possible (right after gtid assignment).
3685  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3686  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3687  KMP_SET_THREAD_STATE(SERIAL_REGION);
3688  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3689 #endif
3690  __kmp_initialize_root( root );
3691 
3692  /* setup new root thread structure */
3693  if( root->r.r_uber_thread ) {
3694  root_thread = root->r.r_uber_thread;
3695  } else {
3696  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3697  if ( __kmp_storage_map ) {
3698  __kmp_print_thread_storage_map( root_thread, gtid );
3699  }
3700  root_thread->th.th_info .ds.ds_gtid = gtid;
3701  root_thread->th.th_root = root;
3702  if( __kmp_env_consistency_check ) {
3703  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3704  }
3705  #if USE_FAST_MEMORY
3706  __kmp_initialize_fast_memory( root_thread );
3707  #endif /* USE_FAST_MEMORY */
3708 
3709  #if KMP_USE_BGET
3710  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3711  __kmp_initialize_bget( root_thread );
3712  #endif
3713  __kmp_init_random( root_thread ); // Initialize random number generator
3714  }
3715 
3716  /* setup the serial team held in reserve by the root thread */
3717  if( ! root_thread->th.th_serial_team ) {
3718  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3719  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3720 
3721  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3722 #if OMPT_SUPPORT
3723  0, // root parallel id
3724 #endif
3725 #if OMP_40_ENABLED
3726  proc_bind_default,
3727 #endif
3728  &r_icvs,
3729  0 USE_NESTED_HOT_ARG(NULL) );
3730  }
3731  KMP_ASSERT( root_thread->th.th_serial_team );
3732  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3733  root_thread->th.th_serial_team ) );
3734 
3735  /* drop root_thread into place */
3736  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3737 
3738  root->r.r_root_team->t.t_threads[0] = root_thread;
3739  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3740  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3741  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3742  root->r.r_uber_thread = root_thread;
3743 
3744  /* initialize the thread, get it ready to go */
3745  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3746  TCW_4(__kmp_init_gtid, TRUE);
3747 
3748  /* prepare the master thread for get_gtid() */
3749  __kmp_gtid_set_specific( gtid );
3750 
3751 #if USE_ITT_BUILD
3752  __kmp_itt_thread_name( gtid );
3753 #endif /* USE_ITT_BUILD */
3754 
3755  #ifdef KMP_TDATA_GTID
3756  __kmp_gtid = gtid;
3757  #endif
3758  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3759  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3760 
3761  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3762  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3763  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3764  KMP_INIT_BARRIER_STATE ) );
3765  { // Initialize barrier data.
3766  int b;
3767  for ( b = 0; b < bs_last_barrier; ++ b ) {
3768  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3769 #if USE_DEBUGGER
3770  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3771 #endif
3772  }; // for
3773  }
3774  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3775 
3776 #if KMP_AFFINITY_SUPPORTED
3777 # if OMP_40_ENABLED
3778  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3779  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3780  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3781  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3782 # endif
3783 
3784  if ( TCR_4(__kmp_init_middle) ) {
3785  __kmp_affinity_set_init_mask( gtid, TRUE );
3786  }
3787 #endif /* KMP_AFFINITY_SUPPORTED */
3788 
3789  __kmp_root_counter ++;
3790 
3791  KMP_MB();
3792  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3793 
3794  return gtid;
3795 }
3796 
3797 #if KMP_NESTED_HOT_TEAMS
3798 static int
3799 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3800 {
3801  int i, n, nth;
3802  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3803  if( !hot_teams || !hot_teams[level].hot_team ) {
3804  return 0;
3805  }
3806  KMP_DEBUG_ASSERT( level < max_level );
3807  kmp_team_t *team = hot_teams[level].hot_team;
3808  nth = hot_teams[level].hot_team_nth;
3809  n = nth - 1; // master is not freed
3810  if( level < max_level - 1 ) {
3811  for( i = 0; i < nth; ++i ) {
3812  kmp_info_t *th = team->t.t_threads[i];
3813  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3814  if( i > 0 && th->th.th_hot_teams ) {
3815  __kmp_free( th->th.th_hot_teams );
3816  th->th.th_hot_teams = NULL;
3817  }
3818  }
3819  }
3820  __kmp_free_team( root, team, NULL );
3821  return n;
3822 }
3823 #endif
3824 
3825 /* Resets a root thread and clears its root and hot teams.
3826  Returns the number of __kmp_threads entries directly and indirectly freed.
3827 */
3828 static int
3829 __kmp_reset_root(int gtid, kmp_root_t *root)
3830 {
3831  kmp_team_t * root_team = root->r.r_root_team;
3832  kmp_team_t * hot_team = root->r.r_hot_team;
3833  int n = hot_team->t.t_nproc;
3834  int i;
3835 
3836  KMP_DEBUG_ASSERT( ! root->r.r_active );
3837 
3838  root->r.r_root_team = NULL;
3839  root->r.r_hot_team = NULL;
3840  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3841  // to __kmp_free_team().
3842  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3843 #if KMP_NESTED_HOT_TEAMS
3844  if( __kmp_hot_teams_max_level > 0 ) { // need to free nested hot teams and their threads if any
3845  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3846  kmp_info_t *th = hot_team->t.t_threads[i];
3847  if( __kmp_hot_teams_max_level > 1 ) {
3848  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3849  }
3850  if( th->th.th_hot_teams ) {
3851  __kmp_free( th->th.th_hot_teams );
3852  th->th.th_hot_teams = NULL;
3853  }
3854  }
3855  }
3856 #endif
3857  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3858 
3859  //
3860  // Before we can reap the thread, we need to make certain that all
3861  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3862  //
3863  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3864  __kmp_wait_to_unref_task_teams();
3865  }
3866 
3867  #if KMP_OS_WINDOWS
3868  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3869  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3870  (LPVOID)&(root->r.r_uber_thread->th),
3871  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3872  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3873  #endif /* KMP_OS_WINDOWS */
3874 
3875 #if OMPT_SUPPORT
3876  if (ompt_enabled &&
3877  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3878  int gtid = __kmp_get_gtid();
3879  __ompt_thread_end(ompt_thread_initial, gtid);
3880  }
3881 #endif
3882 
3883  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3884  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3885 
3886  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3887  root->r.r_uber_thread = NULL;
3888  /* mark root as no longer in use */
3889  root->r.r_begin = FALSE;
3890 
3891  return n;
3892 }
3893 
3894 void
3895 __kmp_unregister_root_current_thread( int gtid )
3896 {
3897  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3898  /* this lock should be ok, since unregister_root_current_thread is never called during
3899  * an abort, only during a normal close. Furthermore, if you have the
3900  * forkjoin lock, you should never try to get the initz lock */
3901 
3902  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3903  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3904  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3905  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3906  return;
3907  }
3908  kmp_root_t *root = __kmp_root[gtid];
3909 
3910  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3911  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3912  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3913  KMP_ASSERT( root->r.r_active == FALSE );
3914 
3915 
3916  KMP_MB();
3917 
3918 #if OMP_45_ENABLED
3919  kmp_info_t * thread = __kmp_threads[gtid];
3920  kmp_team_t * team = thread->th.th_team;
3921  kmp_task_team_t * task_team = thread->th.th_task_team;
3922 
3923  // we need to wait for the proxy tasks before finishing the thread
3924  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3925 #if OMPT_SUPPORT
3926  // the runtime is shutting down so we won't report any events
3927  thread->th.ompt_thread_info.state = ompt_state_undefined;
3928 #endif
3929  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3930  }
3931 #endif
3932 
3933  __kmp_reset_root(gtid, root);
3934 
3935  /* free up this thread slot */
3936  __kmp_gtid_set_specific( KMP_GTID_DNE );
3937 #ifdef KMP_TDATA_GTID
3938  __kmp_gtid = KMP_GTID_DNE;
3939 #endif
3940 
3941  KMP_MB();
3942  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3943 
3944  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3945 }
3946 
3947 #if KMP_OS_WINDOWS
3948 /* __kmp_forkjoin_lock must be already held
3949  Unregisters a root thread that is not the current thread. Returns the number of
3950  __kmp_threads entries freed as a result.
3951  */
3952 static int
3953 __kmp_unregister_root_other_thread( int gtid )
3954 {
3955  kmp_root_t *root = __kmp_root[gtid];
3956  int r;
3957 
3958  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3959  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3960  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3961  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3962  KMP_ASSERT( root->r.r_active == FALSE );
3963 
3964  r = __kmp_reset_root(gtid, root);
3965  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3966  return r;
3967 }
3968 #endif
3969 
3970 #if KMP_DEBUG
3971 void __kmp_task_info() {
3972 
3973  kmp_int32 gtid = __kmp_entry_gtid();
3974  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
3975  kmp_info_t *this_thr = __kmp_threads[ gtid ];
3976  kmp_team_t *steam = this_thr->th.th_serial_team;
3977  kmp_team_t *team = this_thr->th.th_team;
3978 
3979  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3980  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3981 }
3982 #endif // KMP_DEBUG
3983 
3984 /* TODO optimize with one big memclr, take out what isn't needed,
3985  * split responsibility to workers as much as possible, and delay
3986  * initialization of features as much as possible */
3987 static void
3988 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3989 {
3990  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3991  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3992  kmp_info_t *master = team->t.t_threads[0];
3993  KMP_DEBUG_ASSERT( this_thr != NULL );
3994  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3995  KMP_DEBUG_ASSERT( team );
3996  KMP_DEBUG_ASSERT( team->t.t_threads );
3997  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3998  KMP_DEBUG_ASSERT( master );
3999  KMP_DEBUG_ASSERT( master->th.th_root );
4000 
4001  KMP_MB();
4002 
4003  TCW_SYNC_PTR(this_thr->th.th_team, team);
4004 
4005  this_thr->th.th_info.ds.ds_tid = tid;
4006  this_thr->th.th_set_nproc = 0;
4007 #if OMP_40_ENABLED
4008  this_thr->th.th_set_proc_bind = proc_bind_default;
4009 # if KMP_AFFINITY_SUPPORTED
4010  this_thr->th.th_new_place = this_thr->th.th_current_place;
4011 # endif
4012 #endif
4013  this_thr->th.th_root = master->th.th_root;
4014 
4015  /* setup the thread's cache of the team structure */
4016  this_thr->th.th_team_nproc = team->t.t_nproc;
4017  this_thr->th.th_team_master = master;
4018  this_thr->th.th_team_serialized = team->t.t_serialized;
4019  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4020 
4021  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4022 
4023  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4024  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4025 
4026  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4027 
4028  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4029  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4030  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4031 
4032  /* TODO no worksharing in speculative threads */
4033  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4034 
4035  this_thr->th.th_local.this_construct = 0;
4036 
4037 #ifdef BUILD_TV
4038  this_thr->th.th_local.tv_data = 0;
4039 #endif
4040 
4041  if ( ! this_thr->th.th_pri_common ) {
4042  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4043  if ( __kmp_storage_map ) {
4044  __kmp_print_storage_map_gtid(
4045  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4046  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4047  );
4048  }; // if
4049  this_thr->th.th_pri_head = NULL;
4050  }; // if
4051 
4052  /* Initialize dynamic dispatch */
4053  {
4054  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4055  /*
4056  * Use team max_nproc since this will never change for the team.
4057  */
4058  size_t disp_size = sizeof( dispatch_private_info_t ) *
4059  ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
4060  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4061  KMP_ASSERT( dispatch );
4062  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4063  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4064 
4065  dispatch->th_disp_index = 0;
4066 #if OMP_45_ENABLED
4067  dispatch->th_doacross_buf_idx = 0;
4068 #endif
4069  if( ! dispatch->th_disp_buffer ) {
4070  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4071 
4072  if ( __kmp_storage_map ) {
4073  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4074  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4075  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4076  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4077  gtid, team->t.t_id, gtid );
4078  }
4079  } else {
4080  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4081  }
4082 
4083  dispatch->th_dispatch_pr_current = 0;
4084  dispatch->th_dispatch_sh_current = 0;
4085 
4086  dispatch->th_deo_fcn = 0; /* ORDERED */
4087  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4088  }
4089 
4090  this_thr->th.th_next_pool = NULL;
4091 
4092  if (!this_thr->th.th_task_state_memo_stack) {
4093  size_t i;
4094  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4095  this_thr->th.th_task_state_top = 0;
4096  this_thr->th.th_task_state_stack_sz = 4;
4097  for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4098  this_thr->th.th_task_state_memo_stack[i] = 0;
4099  }
4100 
4101  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4102  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4103 
4104  KMP_MB();
4105 }
4106 
4107 
4108 /* Allocate a new thread for the requesting team. This is only called from within a
4109  * fork/join critical section. We will first try to get an available thread from the
4110  * thread pool. If none is available, we will fork a new one, assuming we are able
4111  * to create one; this should be assured, as the caller should have checked on this
4112  * first.
4113  */
4114 kmp_info_t *
4115 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4116 {
4117  kmp_team_t *serial_team;
4118  kmp_info_t *new_thr;
4119  int new_gtid;
4120 
4121  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4122  KMP_DEBUG_ASSERT( root && team );
4123 #if !KMP_NESTED_HOT_TEAMS
4124  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4125 #endif
4126  KMP_MB();
4127 
4128  /* first, try to get one from the thread pool */
4129  if ( __kmp_thread_pool ) {
4130 
4131  new_thr = (kmp_info_t*)__kmp_thread_pool;
4132  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4133  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4134  __kmp_thread_pool_insert_pt = NULL;
4135  }
4136  TCW_4(new_thr->th.th_in_pool, FALSE);
4137  //
4138  // Don't touch th_active_in_pool or th_active.
4139  // The worker thread adjusts those flags as it sleeps/awakens.
4140  //
4141  __kmp_thread_pool_nth--;
4142 
4143  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4144  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4145  KMP_ASSERT( ! new_thr->th.th_team );
4146  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4147  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4148 
4149  /* setup the thread structure */
4150  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4151  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4152 
4153  TCW_4(__kmp_nth, __kmp_nth + 1);
4154 
4155  new_thr->th.th_task_state = 0;
4156  new_thr->th.th_task_state_top = 0;
4157  new_thr->th.th_task_state_stack_sz = 4;
4158 
4159 #ifdef KMP_ADJUST_BLOCKTIME
4160  /* Adjust blocktime back to zero if necessary */
4161  /* Middle initialization might not have occurred yet */
4162  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4163  if ( __kmp_nth > __kmp_avail_proc ) {
4164  __kmp_zero_bt = TRUE;
4165  }
4166  }
4167 #endif /* KMP_ADJUST_BLOCKTIME */
4168 
4169 #if KMP_DEBUG
4170  // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
4171  int b;
4172  kmp_balign_t * balign = new_thr->th.th_bar;
4173  for( b = 0; b < bs_last_barrier; ++ b )
4174  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4175 #endif
4176 
4177  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4178  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4179 
4180  KMP_MB();
4181  return new_thr;
4182  }
4183 
4184 
4185  /* no, we'll fork a new one */
4186  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4187  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4188 
4189 #if KMP_USE_MONITOR
4190  //
4191  // If this is the first worker thread the RTL is creating, then also
4192  // launch the monitor thread. We try to do this as early as possible.
4193  //
4194  if ( ! TCR_4( __kmp_init_monitor ) ) {
4195  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4196  if ( ! TCR_4( __kmp_init_monitor ) ) {
4197  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4198  TCW_4( __kmp_init_monitor, 1 );
4199  __kmp_create_monitor( & __kmp_monitor );
4200  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4201  #if KMP_OS_WINDOWS
4202  // AC: wait until monitor has started. This is a fix for CQ232808.
4203  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4204  // work in between, then there is a high probability that the monitor thread starts after
4205  // the library shutdown. At shutdown it is too late to cope with the problem, because
4206  // when the master is in DllMain (process detach) the monitor has no chance to start
4207  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
4208  // because all the memory which the monitor can access is going to be released/reset.
4209  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4210  KMP_YIELD( TRUE );
4211  }
4212  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4213  #endif
4214  }
4215  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4216  }
4217 #endif
4218 
4219  KMP_MB();
4220  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4221  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4222  }
4223 
4224  /* allocate space for it. */
4225  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4226 
4227  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4228 
4229  if ( __kmp_storage_map ) {
4230  __kmp_print_thread_storage_map( new_thr, new_gtid );
4231  }
4232 
4233  /* add the reserve serialized team, initialized from the team's master thread */
4234  {
4235  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4236  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4237 
4238  new_thr->th.th_serial_team = serial_team =
4239  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4240 #if OMPT_SUPPORT
4241  0, // root parallel id
4242 #endif
4243 #if OMP_40_ENABLED
4244  proc_bind_default,
4245 #endif
4246  &r_icvs,
4247  0 USE_NESTED_HOT_ARG(NULL) );
4248  }
4249  KMP_ASSERT ( serial_team );
4250  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
4251  serial_team->t.t_threads[0] = new_thr;
4252  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4253  new_thr ) );
4254 
4255  /* setup the thread structures */
4256  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4257 
4258  #if USE_FAST_MEMORY
4259  __kmp_initialize_fast_memory( new_thr );
4260  #endif /* USE_FAST_MEMORY */
4261 
4262  #if KMP_USE_BGET
4263  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4264  __kmp_initialize_bget( new_thr );
4265  #endif
4266 
4267  __kmp_init_random( new_thr ); // Initialize random number generator
4268 
4269  /* Initialize these only once when thread is grabbed for a team allocation */
4270  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4271  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4272 
4273  int b;
4274  kmp_balign_t * balign = new_thr->th.th_bar;
4275  for(b=0; b<bs_last_barrier; ++b) {
4276  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4277  balign[b].bb.team = NULL;
4278  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4279  balign[b].bb.use_oncore_barrier = 0;
4280  }
4281 
4282  new_thr->th.th_spin_here = FALSE;
4283  new_thr->th.th_next_waiting = 0;
4284 
4285 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4286  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4287  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4288  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4289  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4290 #endif
4291 
4292  TCW_4(new_thr->th.th_in_pool, FALSE);
4293  new_thr->th.th_active_in_pool = FALSE;
4294  TCW_4(new_thr->th.th_active, TRUE);
4295 
4296  /* adjust the global counters */
4297  __kmp_all_nth ++;
4298  __kmp_nth ++;
4299 
4300  //
4301  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4302  // for low numbers of procs, and method #2 (keyed API call) for higher
4303  // numbers of procs.
4304  //
4305  if ( __kmp_adjust_gtid_mode ) {
4306  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4307  if ( TCR_4(__kmp_gtid_mode) != 2) {
4308  TCW_4(__kmp_gtid_mode, 2);
4309  }
4310  }
4311  else {
4312  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4313  TCW_4(__kmp_gtid_mode, 1);
4314  }
4315  }
4316  }
4317 
4318 #ifdef KMP_ADJUST_BLOCKTIME
4319  /* Adjust blocktime back to zero if necessary */
4320  /* Middle initialization might not have occurred yet */
4321  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4322  if ( __kmp_nth > __kmp_avail_proc ) {
4323  __kmp_zero_bt = TRUE;
4324  }
4325  }
4326 #endif /* KMP_ADJUST_BLOCKTIME */
4327 
4328  /* actually fork it and create the new worker thread */
4329  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4330  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4331  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4332 
4333  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4334  KMP_MB();
4335  return new_thr;
4336 }
4337 
4338 /*
4339  * reinitialize team for reuse.
4340  *
4341  * The hot team code calls this routine at every fork barrier, so the EPCC barrier
4342  * tests are extremely sensitive to changes in it, especially writes to the team
4343  * struct, which cause a cache invalidation in all threads.
4344  *
4345  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4346  */
4347 static void
4348 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4349  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4350  team->t.t_threads[0], team ) );
4351  KMP_DEBUG_ASSERT( team && new_icvs);
4352  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
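 // KMP_CHECK_UPDATE stores a value only if it differs from the current one,
 // so fields that have not changed do not dirty the team struct's cache
 // lines (see the EPCC note above this routine).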
4353  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4354 
4355  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4356 
4357  // Copy ICVs to the master thread's implicit taskdata
4358  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4359  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4360 
4361  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4362  team->t.t_threads[0], team ) );
4363 }
4364 
4365 
4366 /* initialize the team data structure
4367  * this assumes the t_threads and t_max_nproc are already set
4368  * also, we don't touch the arguments */
4369 static void
4370 __kmp_initialize_team(
4371  kmp_team_t * team,
4372  int new_nproc,
4373  kmp_internal_control_t * new_icvs,
4374  ident_t * loc
4375 ) {
4376  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4377 
4378  /* verify */
4379  KMP_DEBUG_ASSERT( team );
4380  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4381  KMP_DEBUG_ASSERT( team->t.t_threads );
4382  KMP_MB();
4383 
4384  team->t.t_master_tid = 0; /* not needed */
4385  /* team->t.t_master_bar; not needed */
4386  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4387  team->t.t_nproc = new_nproc;
4388 
4389  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4390  team->t.t_next_pool = NULL;
4391  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4392 
4393  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4394  team->t.t_invoke = NULL; /* not needed */
4395 
4396  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4397  team->t.t_sched = new_icvs->sched;
4398 
4399 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4400  team->t.t_fp_control_saved = FALSE; /* not needed */
4401  team->t.t_x87_fpu_control_word = 0; /* not needed */
4402  team->t.t_mxcsr = 0; /* not needed */
4403 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4404 
4405  team->t.t_construct = 0;
4406  __kmp_init_lock( & team->t.t_single_lock );
4407 
4408  team->t.t_ordered .dt.t_value = 0;
4409  team->t.t_master_active = FALSE;
4410 
4411  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4412 
4413 #ifdef KMP_DEBUG
4414  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4415 #endif
4416  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4417 
4418  team->t.t_control_stack_top = NULL;
4419 
4420  __kmp_reinitialize_team( team, new_icvs, loc );
4421 
4422  KMP_MB();
4423  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4424 }
4425 
4426 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4427 /* Sets the full mask for the thread and returns the old mask; no changes to structures. */
4428 static void
4429 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4430 {
4431  if ( KMP_AFFINITY_CAPABLE() ) {
4432  int status;
4433  if ( old_mask != NULL ) {
4434  status = __kmp_get_system_affinity( old_mask, TRUE );
4435  int error = errno;
4436  if ( status != 0 ) {
4437  __kmp_msg(
4438  kmp_ms_fatal,
4439  KMP_MSG( ChangeThreadAffMaskError ),
4440  KMP_ERR( error ),
4441  __kmp_msg_null
4442  );
4443  }
4444  }
4445  __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4446  }
4447 }
4448 #endif
4449 
4450 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4451 
4452 //
4453 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4454  // It calculates the worker + master thread's partition based upon the parent
4455  // thread's partition, and binds each worker to a place in its partition.
4456 // The master thread's partition should already include its current binding.
4457 //
4458 static void
4459 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4460 {
4461  //
4462  // Copy the master thread's place partion to the team struct
4463  // Copy the master thread's place partition to the team struct
4464  kmp_info_t *master_th = team->t.t_threads[0];
4465  KMP_DEBUG_ASSERT( master_th != NULL );
4466  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4467  int first_place = master_th->th.th_first_place;
4468  int last_place = master_th->th.th_last_place;
4469  int masters_place = master_th->th.th_current_place;
4470  team->t.t_first_place = first_place;
4471  team->t.t_last_place = last_place;
4472 
4473  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4474  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4475  masters_place, first_place, last_place ) );
4476 
4477  switch ( proc_bind ) {
4478 
4479  case proc_bind_default:
4480  //
4481  // serial teams might have the proc_bind policy set to
4482  // proc_bind_default. It doesn't matter, as we don't
4483  // rebind the master thread for any proc_bind policy.
4484  //
4485  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4486  break;
4487 
4488  case proc_bind_master:
4489  {
4490  int f;
4491  int n_th = team->t.t_nproc;
4492  for ( f = 1; f < n_th; f++ ) {
4493  kmp_info_t *th = team->t.t_threads[f];
4494  KMP_DEBUG_ASSERT( th != NULL );
4495  th->th.th_first_place = first_place;
4496  th->th.th_last_place = last_place;
4497  th->th.th_new_place = masters_place;
4498 
4499  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4500  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4501  team->t.t_id, f, masters_place, first_place, last_place ) );
4502  }
4503  }
4504  break;
4505 
4506  case proc_bind_close:
4507  {
4508  int f;
4509  int n_th = team->t.t_nproc;
4510  int n_places;
4511  if ( first_place <= last_place ) {
4512  n_places = last_place - first_place + 1;
4513  }
4514  else {
4515  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4516  }
4517  if ( n_th <= n_places ) {
4518  int place = masters_place;
4519  for ( f = 1; f < n_th; f++ ) {
4520  kmp_info_t *th = team->t.t_threads[f];
4521  KMP_DEBUG_ASSERT( th != NULL );
4522 
4523  if ( place == last_place ) {
4524  place = first_place;
4525  }
4526  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4527  place = 0;
4528  }
4529  else {
4530  place++;
4531  }
4532  th->th.th_first_place = first_place;
4533  th->th.th_last_place = last_place;
4534  th->th.th_new_place = place;
4535 
4536  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4537  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4538  team->t.t_id, f, place, first_place, last_place ) );
4539  }
4540  }
4541  else {
4542  int S, rem, gap, s_count;
4543  S = n_th / n_places;
4544  s_count = 0;
4545  rem = n_th - ( S * n_places );
4546  gap = rem > 0 ? n_places/rem : n_places;
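 // Worked example (illustrative): with n_th == 10 and n_places == 4,
 // S == 2, rem == 2 and gap == 2, so every "gap"-th place gets one
 // extra thread and the threads are distributed over the places as
 // 3,2,3,2, starting at the master's place and wrapping back to it.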
4547  int place = masters_place;
4548  int gap_ct = gap;
4549  for ( f = 0; f < n_th; f++ ) {
4550  kmp_info_t *th = team->t.t_threads[f];
4551  KMP_DEBUG_ASSERT( th != NULL );
4552 
4553  th->th.th_first_place = first_place;
4554  th->th.th_last_place = last_place;
4555  th->th.th_new_place = place;
4556  s_count++;
4557 
4558  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4559  // do nothing; an extra thread will be added to this place on the next iteration
4560  }
4561  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4562  // we added an extra thread to this place; move to next place
4563  if ( place == last_place ) {
4564  place = first_place;
4565  }
4566  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4567  place = 0;
4568  }
4569  else {
4570  place++;
4571  }
4572  s_count = 0;
4573  gap_ct = 1;
4574  rem--;
4575  }
4576  else if (s_count == S) { // place full; don't add extra
4577  if ( place == last_place ) {
4578  place = first_place;
4579  }
4580  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4581  place = 0;
4582  }
4583  else {
4584  place++;
4585  }
4586  gap_ct++;
4587  s_count = 0;
4588  }
4589 
4590  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4591  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4592  team->t.t_id, f, th->th.th_new_place, first_place,
4593  last_place ) );
4594  }
4595  KMP_DEBUG_ASSERT( place == masters_place );
4596  }
4597  }
4598  break;
4599 
4600  case proc_bind_spread:
4601  {
4602  int f;
4603  int n_th = team->t.t_nproc;
4604  int n_places;
4605  int thidx;
4606  if ( first_place <= last_place ) {
4607  n_places = last_place - first_place + 1;
4608  }
4609  else {
4610  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4611  }
4612  if ( n_th <= n_places ) {
4613  int place = masters_place;
4614  int S = n_places/n_th;
4615  int s_count, rem, gap, gap_ct;
4616  rem = n_places - n_th*S;
4617  gap = rem ? n_th/rem : 1;
4618  gap_ct = gap;
4619  thidx = n_th;
4620  if (update_master_only == 1)
4621  thidx = 1;
4622  for ( f = 0; f < thidx; f++ ) {
4623  kmp_info_t *th = team->t.t_threads[f];
4624  KMP_DEBUG_ASSERT( th != NULL );
4625 
4626  th->th.th_first_place = place;
4627  th->th.th_new_place = place;
4628  s_count = 1;
4629  while (s_count < S) {
4630  if ( place == last_place ) {
4631  place = first_place;
4632  }
4633  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4634  place = 0;
4635  }
4636  else {
4637  place++;
4638  }
4639  s_count++;
4640  }
4641  if (rem && (gap_ct == gap)) {
4642  if ( place == last_place ) {
4643  place = first_place;
4644  }
4645  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4646  place = 0;
4647  }
4648  else {
4649  place++;
4650  }
4651  rem--;
4652  gap_ct = 0;
4653  }
4654  th->th.th_last_place = place;
4655  gap_ct++;
4656 
4657  if ( place == last_place ) {
4658  place = first_place;
4659  }
4660  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4661  place = 0;
4662  }
4663  else {
4664  place++;
4665  }
4666 
4667  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4668  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4669  team->t.t_id, f, th->th.th_new_place,
4670  th->th.th_first_place, th->th.th_last_place ) );
4671  }
4672  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4673  }
4674  else {
4675  int S, rem, gap, s_count;
4676  S = n_th / n_places;
4677  s_count = 0;
4678  rem = n_th - ( S * n_places );
4679  gap = rem > 0 ? n_places/rem : n_places;
4680  int place = masters_place;
4681  int gap_ct = gap;
4682  thidx = n_th;
4683  if (update_master_only == 1)
4684  thidx = 1;
4685  for ( f = 0; f < thidx; f++ ) {
4686  kmp_info_t *th = team->t.t_threads[f];
4687  KMP_DEBUG_ASSERT( th != NULL );
4688 
4689  th->th.th_first_place = place;
4690  th->th.th_last_place = place;
4691  th->th.th_new_place = place;
4692  s_count++;
4693 
4694  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4695  // do nothing, add an extra thread to this place on the next iteration
4696  }
4697  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4698  // we added an extra thread to this place; move on to next place
4699  if ( place == last_place ) {
4700  place = first_place;
4701  }
4702  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4703  place = 0;
4704  }
4705  else {
4706  place++;
4707  }
4708  s_count = 0;
4709  gap_ct = 1;
4710  rem--;
4711  }
4712  else if (s_count == S) { // place is full; don't add extra thread
4713  if ( place == last_place ) {
4714  place = first_place;
4715  }
4716  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4717  place = 0;
4718  }
4719  else {
4720  place++;
4721  }
4722  gap_ct++;
4723  s_count = 0;
4724  }
4725 
4726  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4727  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4728  team->t.t_id, f, th->th.th_new_place,
4729  th->th.th_first_place, th->th.th_last_place) );
4730  }
4731  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4732  }
4733  }
4734  break;
4735 
4736  default:
4737  break;
4738  }
4739 
4740  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4741 }
4742 
4743 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
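
// Illustrative sketch (not part of the runtime; hypothetical names): the
// oversubscribed branches of __kmp_partition_places above spread n_th threads
// over n_places places with the S / rem / gap bookkeeping, giving every
// "gap"-th place one extra thread until rem is exhausted. A minimal standalone
// model of that arithmetic, with the place index simplified to a plain modulo
// wrap instead of the first_place/last_place window:
#if 0
#include <stdio.h>

// counts[i] receives the number of threads assigned to place i.
static void model_close_partition( int n_th, int n_places, int *counts )
{
    int S = n_th / n_places;                        // base threads per place
    int rem = n_th - S * n_places;                  // places that get one extra thread
    int gap = rem > 0 ? n_places / rem : n_places;  // spacing between the "extra" places
    int gap_ct = gap, s_count = 0, place = 0;
    for ( int f = 0; f < n_th; f++ ) {
        counts[place]++;                            // assign thread f to the current place
        s_count++;
        if ( s_count == S && rem && gap_ct == gap ) {
            // keep this place open for one extra thread on the next iteration
        } else if ( s_count == S + 1 && rem && gap_ct == gap ) {
            place = ( place + 1 ) % n_places;  s_count = 0;  gap_ct = 1;  rem--;
        } else if ( s_count == S ) {
            place = ( place + 1 ) % n_places;  s_count = 0;  gap_ct++;
        }
    }
}

int main( void )
{
    int counts[4] = { 0 };
    model_close_partition( 10, 4, counts );         // S=2, rem=2, gap=2
    for ( int i = 0; i < 4; i++ )
        printf( "place %d: %d threads\n", i, counts[i] );   // prints 3, 2, 3, 2
    return 0;
}
#endif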
4744 
4745 /* allocate a new team data structure to use. take one off of the free pool if available */
4746 kmp_team_t *
4747 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4748 #if OMPT_SUPPORT
4749  ompt_parallel_id_t ompt_parallel_id,
4750 #endif
4751 #if OMP_40_ENABLED
4752  kmp_proc_bind_t new_proc_bind,
4753 #endif
4754  kmp_internal_control_t *new_icvs,
4755  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4756 {
4757  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4758  int f;
4759  kmp_team_t *team;
4760  int use_hot_team = ! root->r.r_active;
4761  int level = 0;
4762 
4763  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4764  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4765  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4766  KMP_MB();
4767 
4768 #if KMP_NESTED_HOT_TEAMS
4769  kmp_hot_team_ptr_t *hot_teams;
4770  if( master ) {
4771  team = master->th.th_team;
4772  level = team->t.t_active_level;
4773  if( master->th.th_teams_microtask ) { // in teams construct?
4774  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4775  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4776  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4777  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4778  }
4779  }
4780  hot_teams = master->th.th_hot_teams;
4781  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4782  { // hot team has already been allocated for given level
4783  use_hot_team = 1;
4784  } else {
4785  use_hot_team = 0;
4786  }
4787  }
4788 #endif
4789  // Optimization to use a "hot" team
4790  if( use_hot_team && new_nproc > 1 ) {
4791  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4792 #if KMP_NESTED_HOT_TEAMS
4793  team = hot_teams[level].hot_team;
4794 #else
4795  team = root->r.r_hot_team;
4796 #endif
4797 #if KMP_DEBUG
4798  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4799  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4800  team->t.t_task_team[0], team->t.t_task_team[1] ));
4801  }
4802 #endif
4803 
4804  // Has the number of threads changed?
4805  /* Let's assume the most common case is that the number of threads is unchanged, and
4806  put that case first. */
4807  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4808  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4809  // This case can mean that omp_set_num_threads() was called and the hot team size
4810  // was already reduced, so we check the special flag
4811  if ( team->t.t_size_changed == -1 ) {
4812  team->t.t_size_changed = 1;
4813  } else {
4814  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4815  }
4816 
4817  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4818  kmp_r_sched_t new_sched = new_icvs->sched;
4819  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4820  team->t.t_sched.chunk != new_sched.chunk)
4821  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4822 
4823  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4824 
4825  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4826  0, team->t.t_threads[0], team ) );
4827  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4828 
4829 #if OMP_40_ENABLED
4830 # if KMP_AFFINITY_SUPPORTED
4831  if ( ( team->t.t_size_changed == 0 )
4832  && ( team->t.t_proc_bind == new_proc_bind ) ) {
4833  if (new_proc_bind == proc_bind_spread) {
4834  __kmp_partition_places(team, 1); // add flag to update only master for spread
4835  }
4836  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4837  team->t.t_id, new_proc_bind, team->t.t_first_place,
4838  team->t.t_last_place ) );
4839  }
4840  else {
4841  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4842  __kmp_partition_places( team );
4843  }
4844 # else
4845  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4846 # endif /* KMP_AFFINITY_SUPPORTED */
4847 #endif /* OMP_40_ENABLED */
4848  }
4849  else if( team->t.t_nproc > new_nproc ) {
4850  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4851 
4852  team->t.t_size_changed = 1;
4853 #if KMP_NESTED_HOT_TEAMS
4854  if( __kmp_hot_teams_mode == 0 ) {
4855  // AC: the saved number of threads should correspond to the team's value in this mode;
4856  // it can be bigger in mode 1, when the hot team has some threads in reserve
4857  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4858  hot_teams[level].hot_team_nth = new_nproc;
4859 #endif // KMP_NESTED_HOT_TEAMS
4860  /* release the extra threads we don't need any more */
4861  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4862  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4863  if ( __kmp_tasking_mode != tskm_immediate_exec) {
4864  // When decreasing team size, threads no longer in the team should unref task team.
4865  team->t.t_threads[f]->th.th_task_team = NULL;
4866  }
4867  __kmp_free_thread( team->t.t_threads[ f ] );
4868  team->t.t_threads[ f ] = NULL;
4869  }
4870 #if KMP_NESTED_HOT_TEAMS
4871  } // (__kmp_hot_teams_mode == 0)
4872  else {
4873  // When keeping extra threads in the team, switch them to wait on their own b_go flag
4874  for (f=new_nproc; f<team->t.t_nproc; ++f) {
4875  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4876  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4877  for (int b=0; b<bs_last_barrier; ++b) {
4878  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4879  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4880  }
4881  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4882  }
4883  }
4884  }
4885 #endif // KMP_NESTED_HOT_TEAMS
4886  team->t.t_nproc = new_nproc;
4887  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4888  if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4889  team->t.t_sched.chunk != new_icvs->sched.chunk)
4890  team->t.t_sched = new_icvs->sched;
4891  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4892 
4893  /* update the remaining threads */
4894  for(f = 0; f < new_nproc; ++f) {
4895  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4896  }
4897  // restore the current task state of the master thread: should be the implicit task
4898  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4899  0, team->t.t_threads[0], team ) );
4900 
4901  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4902 
4903 #ifdef KMP_DEBUG
4904  for ( f = 0; f < team->t.t_nproc; f++ ) {
4905  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4906  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4907  }
4908 #endif
4909 
4910 #if OMP_40_ENABLED
4911  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4912 # if KMP_AFFINITY_SUPPORTED
4913  __kmp_partition_places( team );
4914 # endif
4915 #endif
4916  }
4917  else { // team->t.t_nproc < new_nproc
4918 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4919  kmp_affin_mask_t *old_mask;
4920  if ( KMP_AFFINITY_CAPABLE() ) {
4921  KMP_CPU_ALLOC(old_mask);
4922  }
4923 #endif
4924 
4925  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4926 
4927  team->t.t_size_changed = 1;
4928 
4929 #if KMP_NESTED_HOT_TEAMS
4930  int avail_threads = hot_teams[level].hot_team_nth;
4931  if( new_nproc < avail_threads )
4932  avail_threads = new_nproc;
4933  kmp_info_t **other_threads = team->t.t_threads;
4934  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4935  // Adjust barrier data of reserved threads (if any) of the team
4936  // Other data will be set in __kmp_initialize_info() below.
4937  int b;
4938  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4939  for ( b = 0; b < bs_last_barrier; ++ b ) {
4940  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4941  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4942 #if USE_DEBUGGER
4943  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4944 #endif
4945  }
4946  }
4947  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4948  // we have all the needed threads in reserve, no need to allocate any
4949  // this is only possible in mode 1, as there cannot be reserved threads in mode 0
4950  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4951  team->t.t_nproc = new_nproc; // just get reserved threads involved
4952  } else {
4953  // we may have some threads in reserve, but not enough
4954  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4955  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4956 #endif // KMP_NESTED_HOT_TEAMS
4957  if(team->t.t_max_nproc < new_nproc) {
4958  /* reallocate larger arrays */
4959  __kmp_reallocate_team_arrays(team, new_nproc);
4960  __kmp_reinitialize_team( team, new_icvs, NULL );
4961  }
4962 
4963 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4964  /* Temporarily set a full mask for the master thread before
4965  creation of the workers. The reason is that workers inherit
4966  the affinity from the master, so if a lot of workers are
4967  created on a single core quickly, they don't get
4968  a chance to set their own affinity for a long time.
4969  */
4970  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4971 #endif
4972 
4973  /* allocate new threads for the hot team */
4974  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4975  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4976  KMP_DEBUG_ASSERT( new_worker );
4977  team->t.t_threads[ f ] = new_worker;
4978 
4979  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: join=%llu, plain=%llu\n",
4980  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4981  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4982  team->t.t_bar[bs_plain_barrier].b_arrived ) );
4983 
4984  { // Initialize barrier data for new threads.
4985  int b;
4986  kmp_balign_t * balign = new_worker->th.th_bar;
4987  for( b = 0; b < bs_last_barrier; ++ b ) {
4988  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4989  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4990 #if USE_DEBUGGER
4991  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4992 #endif
4993  }
4994  }
4995  }
4996 
4997 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4998  if ( KMP_AFFINITY_CAPABLE() ) {
4999  /* Restore initial master thread's affinity mask */
5000  __kmp_set_system_affinity( old_mask, TRUE );
5001  KMP_CPU_FREE(old_mask);
5002  }
5003 #endif
5004 #if KMP_NESTED_HOT_TEAMS
5005  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5006 #endif // KMP_NESTED_HOT_TEAMS
5007  /* make sure everyone is synchronized */
5008  int old_nproc = team->t.t_nproc; // save the old value and use it to update only the new threads below
5009  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5010 
5011  /* reinitialize the threads */
5012  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5013  for (f=0; f < team->t.t_nproc; ++f)
5014  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5015  if (level) { // set th_task_state for new threads in nested hot team
5016  // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5017  // th_task_state for the new threads. The master thread's th_task_state will not be accurate until
5018  // after this in __kmp_fork_call(), so we look at the master's memo_stack to get the correct value.
5019  for (f=old_nproc; f < team->t.t_nproc; ++f)
5020  team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5021  }
5022  else { // set th_task_state for new threads in non-nested hot team
5023  int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5024  for (f=old_nproc; f < team->t.t_nproc; ++f)
5025  team->t.t_threads[f]->th.th_task_state = old_state;
5026  }
5027 
5028 #ifdef KMP_DEBUG
5029  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5030  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5031  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5032  }
5033 #endif
5034 
5035 #if OMP_40_ENABLED
5036  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5037 # if KMP_AFFINITY_SUPPORTED
5038  __kmp_partition_places( team );
5039 # endif
5040 #endif
5041  } // Check changes in number of threads
5042 
5043 #if OMP_40_ENABLED
5044  kmp_info_t *master = team->t.t_threads[0];
5045  if( master->th.th_teams_microtask ) {
5046  for( f = 1; f < new_nproc; ++f ) {
5047  // propagate teams construct specific info to workers
5048  kmp_info_t *thr = team->t.t_threads[f];
5049  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5050  thr->th.th_teams_level = master->th.th_teams_level;
5051  thr->th.th_teams_size = master->th.th_teams_size;
5052  }
5053  }
5054 #endif /* OMP_40_ENABLED */
5055 #if KMP_NESTED_HOT_TEAMS
5056  if( level ) {
5057  // Sync barrier state for nested hot teams, not needed for outermost hot team.
5058  for( f = 1; f < new_nproc; ++f ) {
5059  kmp_info_t *thr = team->t.t_threads[f];
5060  int b;
5061  kmp_balign_t * balign = thr->th.th_bar;
5062  for( b = 0; b < bs_last_barrier; ++ b ) {
5063  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5064  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5065 #if USE_DEBUGGER
5066  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5067 #endif
5068  }
5069  }
5070  }
5071 #endif // KMP_NESTED_HOT_TEAMS
5072 
5073  /* reallocate space for arguments if necessary */
5074  __kmp_alloc_argv_entries( argc, team, TRUE );
5075  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5076  //
5077  // The hot team re-uses the previous task team,
5078  // if untouched during the previous release->gather phase.
5079  //
5080 
5081  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5082 
5083 #if KMP_DEBUG
5084  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5085  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5086  team->t.t_task_team[0], team->t.t_task_team[1] ));
5087  }
5088 #endif
5089 
5090 #if OMPT_SUPPORT
5091  __ompt_team_assign_id(team, ompt_parallel_id);
5092 #endif
5093 
5094  KMP_MB();
5095 
5096  return team;
5097  }
5098 
5099  /* next, let's try to take one from the team pool */
5100  KMP_MB();
5101  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5102  {
5103  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5104  if ( team->t.t_max_nproc >= max_nproc ) {
5105  /* take this team from the team pool */
5106  __kmp_team_pool = team->t.t_next_pool;
5107 
5108  /* setup the team for fresh use */
5109  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5110 
5111  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5112  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5113  team->t.t_task_team[0] = NULL;
5114  team->t.t_task_team[1] = NULL;
5115 
5116  /* reallocate space for arguments if necessary */
5117  __kmp_alloc_argv_entries( argc, team, TRUE );
5118  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5119 
5120  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5121  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5122  { // Initialize barrier data.
5123  int b;
5124  for ( b = 0; b < bs_last_barrier; ++ b) {
5125  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5126 #if USE_DEBUGGER
5127  team->t.t_bar[ b ].b_master_arrived = 0;
5128  team->t.t_bar[ b ].b_team_arrived = 0;
5129 #endif
5130  }
5131  }
5132 
5133 #if OMP_40_ENABLED
5134  team->t.t_proc_bind = new_proc_bind;
5135 #endif
5136 
5137  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5138 
5139 #if OMPT_SUPPORT
5140  __ompt_team_assign_id(team, ompt_parallel_id);
5141 #endif
5142 
5143  KMP_MB();
5144 
5145  return team;
5146  }
5147 
5148  /* reap team if it is too small, then loop back and check the next one */
5149  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5150  /* TODO: use a technique to find the right-sized hot team instead of reaping them */
5151  team = __kmp_reap_team( team );
5152  __kmp_team_pool = team;
5153  }
5154 
5155  /* nothing available in the pool, no matter, make a new team! */
5156  KMP_MB();
5157  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5158 
5159  /* and set it up */
5160  team->t.t_max_nproc = max_nproc;
5161  /* Note well: for some reason, allocating one big buffer and dividing it
5162  * up seems to really hurt performance on the P4, so let's not use
5163  * this approach... */
5164  __kmp_allocate_team_arrays( team, max_nproc );
5165 
5166  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5167  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5168 
5169  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5170  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5171  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5172  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5173 
5174  if ( __kmp_storage_map ) {
5175  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5176  }
5177 
5178  /* allocate space for arguments */
5179  __kmp_alloc_argv_entries( argc, team, FALSE );
5180  team->t.t_argc = argc;
5181 
5182  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5183  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5184  { // Initialize barrier data.
5185  int b;
5186  for ( b = 0; b < bs_last_barrier; ++ b ) {
5187  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5188 #if USE_DEBUGGER
5189  team->t.t_bar[ b ].b_master_arrived = 0;
5190  team->t.t_bar[ b ].b_team_arrived = 0;
5191 #endif
5192  }
5193  }
5194 
5195 #if OMP_40_ENABLED
5196  team->t.t_proc_bind = new_proc_bind;
5197 #endif
5198 
5199 #if OMPT_SUPPORT
5200  __ompt_team_assign_id(team, ompt_parallel_id);
5201  team->t.ompt_serialized_team_info = NULL;
5202 #endif
5203 
5204  KMP_MB();
5205 
5206  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5207 
5208  return team;
5209 }
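
// Illustrative sketch (not part of the runtime; hypothetical names): when the
// hot-team path above does not apply, __kmp_allocate_team scans the team free
// pool for the first entry whose capacity (t_max_nproc) is large enough,
// reaping undersized entries as it goes, and only then allocates a fresh team.
// A minimal model of that pool-scan strategy:
#if 0
#include <stdlib.h>

typedef struct pool_obj {
    int capacity;
    struct pool_obj *next;
} pool_obj_t;

static pool_obj_t *pool = NULL;      // singly linked free pool, like __kmp_team_pool

static pool_obj_t *pool_alloc( int needed_capacity )
{
    while ( pool != NULL ) {
        if ( pool->capacity >= needed_capacity ) {
            pool_obj_t *obj = pool;          // big enough: take it off the pool and reuse it
            pool = obj->next;
            obj->next = NULL;
            return obj;
        }
        pool_obj_t *next = pool->next;       // too small: reap it and keep scanning
        free( pool );
        pool = next;
    }
    // nothing suitable in the pool: allocate a fresh, zero-initialized object
    pool_obj_t *obj = (pool_obj_t *)calloc( 1, sizeof( pool_obj_t ) );
    obj->capacity = needed_capacity;
    return obj;
}
#endif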
5210 
5211 /* TODO implement hot-teams at all levels */
5212 /* TODO implement lazy thread release on demand (disband request) */
5213 
5214 /* free the team. return it to the team pool. release all the threads
5215  * associated with it */
5216 void
5217 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5218 {
5219  int f;
5220  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5221 
5222  /* verify state */
5223  KMP_DEBUG_ASSERT( root );
5224  KMP_DEBUG_ASSERT( team );
5225  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5226  KMP_DEBUG_ASSERT( team->t.t_threads );
5227 
5228  int use_hot_team = team == root->r.r_hot_team;
5229 #if KMP_NESTED_HOT_TEAMS
5230  int level;
5231  kmp_hot_team_ptr_t *hot_teams;
5232  if( master ) {
5233  level = team->t.t_active_level - 1;
5234  if( master->th.th_teams_microtask ) { // in teams construct?
5235  if( master->th.th_teams_size.nteams > 1 ) {
5236  ++level; // level was not increased in teams construct for team_of_masters
5237  }
5238  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5239  master->th.th_teams_level == team->t.t_level ) {
5240  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5241  } // team->t.t_level will be increased inside parallel
5242  }
5243  hot_teams = master->th.th_hot_teams;
5244  if( level < __kmp_hot_teams_max_level ) {
5245  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5246  use_hot_team = 1;
5247  }
5248  }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250 
5251  /* team is done working */
5252  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5253  team->t.t_copyin_counter = 0; // init counter for possible reuse
5254  // Do not reset pointer to parent team to NULL for hot teams.
5255 
5256  /* if we are non-hot team, release our threads */
5257  if( ! use_hot_team ) {
5258  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5259  // Delete task teams
5260  int tt_idx;
5261  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5262  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5263  if ( task_team != NULL ) {
5264  for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5265  team->t.t_threads[f]->th.th_task_team = NULL;
5266  }
5267  KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5268 #if KMP_NESTED_HOT_TEAMS
5269  __kmp_free_task_team( master, task_team );
5270 #endif
5271  team->t.t_task_team[tt_idx] = NULL;
5272  }
5273  }
5274  }
5275 
5276  // Reset pointer to parent team only for non-hot teams.
5277  team->t.t_parent = NULL;
5278  team->t.t_level = 0;
5279  team->t.t_active_level = 0;
5280 
5281  /* free the worker threads */
5282  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5283  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5284  __kmp_free_thread( team->t.t_threads[ f ] );
5285  team->t.t_threads[ f ] = NULL;
5286  }
5287 
5288  /* put the team back in the team pool */
5289  /* TODO limit size of team pool, call reap_team if pool too large */
5290  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5291  __kmp_team_pool = (volatile kmp_team_t*) team;
5292  }
5293 
5294  KMP_MB();
5295 }
5296 
5297 
5298 /* reap the team. destroy it, reclaim all its resources and free its memory */
5299 kmp_team_t *
5300 __kmp_reap_team( kmp_team_t *team )
5301 {
5302  kmp_team_t *next_pool = team->t.t_next_pool;
5303 
5304  KMP_DEBUG_ASSERT( team );
5305  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5306  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5307  KMP_DEBUG_ASSERT( team->t.t_threads );
5308  KMP_DEBUG_ASSERT( team->t.t_argv );
5309 
5310  /* TODO clean the threads that are a part of this? */
5311 
5312  /* free stuff */
5313 
5314  __kmp_free_team_arrays( team );
5315  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5316  __kmp_free( (void*) team->t.t_argv );
5317  __kmp_free( team );
5318 
5319  KMP_MB();
5320  return next_pool;
5321 }
5322 
5323 //
5324 // Free the thread. Don't reap it, just place it on the pool of available
5325 // threads.
5326 //
5327 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5328 // binding for the affinity mechanism to be useful.
5329 //
5330 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5331 // However, we want to avoid a potential performance problem by always
5332 // scanning through the list to find the correct point at which to insert
5333 // the thread (potential N**2 behavior). To do this we keep track of the
5334 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5335 // With single-level parallelism, threads will always be added to the tail
5336 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5337 // parallelism, all bets are off and we may need to scan through the entire
5338 // free list.
5339 //
5340 // This change also has a potentially large performance benefit, for some
5341 // applications. Previously, as threads were freed from the hot team, they
5342 // would be placed back on the free list in inverse order. If the hot team
5343 // grew back to its original size, then the freed thread would be placed
5344 // back on the hot team in reverse order. This could cause bad cache
5345 // locality problems on programs where the size of the hot team regularly
5346 // grew and shrunk.
5347 //
5348 // Now, for single-level parallelism, the OMP tid is always == gtid.
5349 //
5350 void
5351 __kmp_free_thread( kmp_info_t *this_th )
5352 {
5353  int gtid;
5354  kmp_info_t **scan;
5355 
5356  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5357  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5358 
5359  KMP_DEBUG_ASSERT( this_th );
5360 
5361  // When moving a thread to the pool, switch it to wait on its own b_go flag, and reset its team pointer (NULL team).
5362  int b;
5363  kmp_balign_t *balign = this_th->th.th_bar;
5364  for (b=0; b<bs_last_barrier; ++b) {
5365  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5366  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5367  balign[b].bb.team = NULL;
5368  balign[b].bb.leaf_kids = 0;
5369  }
5370  this_th->th.th_task_state = 0;
5371 
5372  /* put thread back on the free pool */
5373  TCW_PTR(this_th->th.th_team, NULL);
5374  TCW_PTR(this_th->th.th_root, NULL);
5375  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5376 
5377  //
5378  // If the __kmp_thread_pool_insert_pt is already past the new insert
5379  // point, then we need to re-scan the entire list.
5380  //
5381  gtid = this_th->th.th_info.ds.ds_gtid;
5382  if ( __kmp_thread_pool_insert_pt != NULL ) {
5383  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5384  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5385  __kmp_thread_pool_insert_pt = NULL;
5386  }
5387  }
5388 
5389  //
5390  // Scan down the list to find the place to insert the thread.
5391  // scan is the address of a link in the list, possibly the address of
5392  // __kmp_thread_pool itself.
5393  //
5394  // In the absence of nested parallelism, the for loop will have 0 iterations.
5395  //
5396  if ( __kmp_thread_pool_insert_pt != NULL ) {
5397  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5398  }
5399  else {
5400  scan = (kmp_info_t **)&__kmp_thread_pool;
5401  }
5402  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5403  scan = &( (*scan)->th.th_next_pool ) );
5404 
5405  //
5406  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5407  // to its address.
5408  //
5409  TCW_PTR(this_th->th.th_next_pool, *scan);
5410  __kmp_thread_pool_insert_pt = *scan = this_th;
5411  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5412  || ( this_th->th.th_info.ds.ds_gtid
5413  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5414  TCW_4(this_th->th.th_in_pool, TRUE);
5415  __kmp_thread_pool_nth++;
5416 
5417  TCW_4(__kmp_nth, __kmp_nth - 1);
5418 
5419 #ifdef KMP_ADJUST_BLOCKTIME
5420  /* Adjust blocktime back to user setting or default if necessary */
5421  /* Middle initialization might never have occurred */
5422  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5423  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5424  if ( __kmp_nth <= __kmp_avail_proc ) {
5425  __kmp_zero_bt = FALSE;
5426  }
5427  }
5428 #endif /* KMP_ADJUST_BLOCKTIME */
5429 
5430  KMP_MB();
5431 }
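
// Illustrative sketch (not part of the runtime; hypothetical names): the
// gtid-sorted free-list insertion above, with its cached insert point, boils
// down to the following pattern (volatile qualifiers and TCW_* macros omitted):
#if 0
#include <stddef.h>

typedef struct node {
    int key;                        // plays the role of the gtid
    struct node *next;
} node_t;

static node_t *pool = NULL;         // sorted by key, like __kmp_thread_pool
static node_t *insert_pt = NULL;    // cached insertion point, like __kmp_thread_pool_insert_pt

static void sorted_insert( node_t *n )
{
    // If the cached insert point is already past the new key, rescan from the head.
    if ( insert_pt != NULL && insert_pt->key > n->key )
        insert_pt = NULL;

    // Start from the cached point when possible; with single-level parallelism
    // keys arrive in increasing order and the loop below does no iterations.
    node_t **scan = ( insert_pt != NULL ) ? &insert_pt->next : &pool;
    while ( *scan != NULL && (*scan)->key < n->key )
        scan = &(*scan)->next;

    n->next = *scan;                // splice in, keeping the list sorted
    *scan = n;
    insert_pt = n;                  // remember where we inserted for next time
}
#endif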
5432 
5433 
5434 /* ------------------------------------------------------------------------ */
5435 
5436 void *
5437 __kmp_launch_thread( kmp_info_t *this_thr )
5438 {
5439  int gtid = this_thr->th.th_info.ds.ds_gtid;
5440 /* void *stack_data;*/
5441  kmp_team_t *(*volatile pteam);
5442 
5443  KMP_MB();
5444  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5445 
5446  if( __kmp_env_consistency_check ) {
5447  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5448  }
5449 
5450 #if OMPT_SUPPORT
5451  if (ompt_enabled) {
5452  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5453  this_thr->th.ompt_thread_info.wait_id = 0;
5454  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5455  if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5456  __ompt_thread_begin(ompt_thread_worker, gtid);
5457  }
5458  }
5459 #endif
5460 
5461  /* This is the place where threads wait for work */
5462  while( ! TCR_4(__kmp_global.g.g_done) ) {
5463  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5464  KMP_MB();
5465 
5466  /* wait for work to do */
5467  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5468 
5469 #if OMPT_SUPPORT
5470  if (ompt_enabled) {
5471  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5472  }
5473 #endif
5474 
5475  /* No tid yet since not part of a team */
5476  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5477 
5478 #if OMPT_SUPPORT
5479  if (ompt_enabled) {
5480  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5481  }
5482 #endif
5483 
5484  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5485 
5486  /* have we been allocated? */
5487  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5488 #if OMPT_SUPPORT
5489  ompt_task_info_t *task_info;
5490  ompt_parallel_id_t my_parallel_id;
5491  if (ompt_enabled) {
5492  task_info = __ompt_get_taskinfo(0);
5493  my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5494  }
5495 #endif
5496  /* we were just woken up, so run our new task */
5497  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5498  int rc;
5499  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5500  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5501 
5502  updateHWFPControl (*pteam);
5503 
5504 #if OMPT_SUPPORT
5505  if (ompt_enabled) {
5506  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5507  // Initialize OMPT task id for implicit task.
5508  int tid = __kmp_tid_from_gtid(gtid);
5509  task_info->task_id = __ompt_task_id_new(tid);
5510  }
5511 #endif
5512 
5513  {
5514  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5515  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5516  rc = (*pteam)->t.t_invoke( gtid );
5517  }
5518  KMP_ASSERT( rc );
5519 
5520 #if OMPT_SUPPORT
5521  if (ompt_enabled) {
5522  /* no frame set while outside task */
5523  task_info->frame.exit_runtime_frame = NULL;
5524 
5525  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5526  }
5527 #endif
5528  KMP_MB();
5529  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5530  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5531  }
5532  /* join barrier after parallel region */
5533  __kmp_join_barrier( gtid );
5534 #if OMPT_SUPPORT && OMPT_TRACE
5535  if (ompt_enabled) {
5536  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5537  // don't access *pteam here: it may have already been freed
5538  // by the master thread behind the barrier (possible race)
5539  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5540  my_parallel_id, task_info->task_id);
5541  }
5542  task_info->frame.exit_runtime_frame = NULL;
5543  task_info->task_id = 0;
5544  }
5545 #endif
5546  }
5547  }
5548  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5549 
5550 #if OMPT_SUPPORT
5551  if (ompt_enabled &&
5552  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5553  __ompt_thread_end(ompt_thread_worker, gtid);
5554  }
5555 #endif
5556 
5557  this_thr->th.th_task_team = NULL;
5558  /* run the destructors for the threadprivate data for this thread */
5559  __kmp_common_destroy_gtid( gtid );
5560 
5561  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5562  KMP_MB();
5563  return this_thr;
5564 }
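
// Illustrative sketch (not part of the runtime; hypothetical names): stripped
// of OMPT hooks and consistency checks, the worker loop above is "park until
// released, run the team's microtask if one was assigned, rejoin, and repeat
// until global shutdown". A minimal condition-variable model:
#if 0
#include <pthread.h>
#include <stdbool.h>

typedef struct worker_ctx {
    pthread_mutex_t lock;
    pthread_cond_t  go;
    void (*task)( void * );          // stands in for (*pteam)->t.t_invoke
    void *arg;
    bool done;                       // stands in for __kmp_global.g.g_done
} worker_ctx_t;

static void *worker_main( void *p )
{
    worker_ctx_t *ctx = (worker_ctx_t *)p;
    pthread_mutex_lock( &ctx->lock );
    while ( !ctx->done ) {
        // "fork barrier": sleep until the master posts work or requests shutdown
        while ( ctx->task == NULL && !ctx->done )
            pthread_cond_wait( &ctx->go, &ctx->lock );
        if ( ctx->task != NULL ) {
            void (*task)( void * ) = ctx->task;
            void *arg = ctx->arg;
            ctx->task = NULL;
            pthread_mutex_unlock( &ctx->lock );
            task( arg );                        // invoke the parallel region body
            pthread_mutex_lock( &ctx->lock );   // the "join barrier" would go here
        }
    }
    pthread_mutex_unlock( &ctx->lock );
    return NULL;
}
#endif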
5565 
5566 /* ------------------------------------------------------------------------ */
5567 /* ------------------------------------------------------------------------ */
5568 
5569 void
5570 __kmp_internal_end_dest( void *specific_gtid )
5571 {
5572  #if KMP_COMPILER_ICC
5573  #pragma warning( push )
5574  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5575  #endif
5576  // Make sure no significant bits are lost
5577  int gtid = (kmp_intptr_t)specific_gtid - 1;
5578  #if KMP_COMPILER_ICC
5579  #pragma warning( pop )
5580  #endif
5581 
5582  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5583  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5584  * this is because 0 is reserved for the nothing-stored case */
5585 
5586  /* josh: One reason for setting the gtid specific data even when it is being
5587  destroyed by pthread is to allow gtid lookup through thread specific data
5588  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5589  that gets executed in the call to __kmp_internal_end_thread, actually
5590  gets the gtid through the thread specific data. Setting it here seems
5591  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5592  to run smoothly.
5593  todo: get rid of this after we remove the dependence on
5594  __kmp_gtid_get_specific
5595  */
5596  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5597  __kmp_gtid_set_specific( gtid );
5598  #ifdef KMP_TDATA_GTID
5599  __kmp_gtid = gtid;
5600  #endif
5601  __kmp_internal_end_thread( gtid );
5602 }
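
// Illustrative sketch (not part of the runtime; hypothetical names): the
// "store gtid+1" convention above exists because thread-specific data reads
// back as 0/NULL when nothing was ever stored, so 0 must stay reserved for the
// "no gtid" case. A minimal pthread-based model (pthread_key_create() omitted):
#if 0
#include <pthread.h>
#include <stdint.h>

static pthread_key_t gtid_key;

static void gtid_store( int gtid )
{
    // Shift by one so that a stored gtid of 0 is distinguishable from "not set".
    pthread_setspecific( gtid_key, (void *)(intptr_t)( gtid + 1 ) );
}

static int gtid_load( void )
{
    intptr_t v = (intptr_t)pthread_getspecific( gtid_key );
    return ( v == 0 ) ? -1 /* not set */ : (int)( v - 1 );
}
#endif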
5603 
5604 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5605 
5606 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5607 // perfectly, but in the real libomp.so I have no evidence it is ever called. However, the -fini linker
5608 // option in makefile.mk works fine.
5609 
5610 __attribute__(( destructor ))
5611 void
5612 __kmp_internal_end_dtor( void )
5613 {
5614  __kmp_internal_end_atexit();
5615 }
5616 
5617 void
5618 __kmp_internal_end_fini( void )
5619 {
5620  __kmp_internal_end_atexit();
5621 }
5622 
5623 #endif
5624 
5625 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5626 void
5627 __kmp_internal_end_atexit( void )
5628 {
5629  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5630  /* [Windows]
5631  josh: ideally, we want to completely shut down the library in this atexit handler, but
5632  stat code that depends on thread specific data for gtid fails because that data becomes
5633  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5634  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5635  stat code and use __kmp_internal_end_library to cleanly shut down the library.
5636 
5637 // TODO: Can some of this comment about GVS be removed?
5638  I suspect that the offending stat code is executed when the calling thread tries to
5639  clean up a dead root thread's data structures, resulting in GVS code trying to close
5640  the GVS structures for that thread, but since the stat code uses
5641  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5642  cleaning up itself instead of another thread, it gets confused. This happens because
5643  allowing a thread to unregister and cleanup another thread is a recent modification for
5644  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5645  thread may end up trying to unregister another thread only if thread death does not
5646  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5647  specific data destructor function to detect thread death. For Windows dynamic, there
5648  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5649  workaround is applicable only for Windows static stat library.
5650  */
5651  __kmp_internal_end_library( -1 );
5652  #if KMP_OS_WINDOWS
5653  __kmp_close_console();
5654  #endif
5655 }
5656 
5657 static void
5658 __kmp_reap_thread(
5659  kmp_info_t * thread,
5660  int is_root
5661 ) {
5662 
5663  // It is assumed __kmp_forkjoin_lock is acquired.
5664 
5665  int gtid;
5666 
5667  KMP_DEBUG_ASSERT( thread != NULL );
5668 
5669  gtid = thread->th.th_info.ds.ds_gtid;
5670 
5671  if ( ! is_root ) {
5672 
5673  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5674  /* Assume the threads are at the fork barrier here */
5675  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5676  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5677  ANNOTATE_HAPPENS_BEFORE(thread);
5678  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5679  __kmp_release_64(&flag);
5680  }; // if
5681 
5682  // Terminate OS thread.
5683  __kmp_reap_worker( thread );
5684 
5685  //
5686  // The thread was killed asynchronously. If it was actively
5687  // spinning in the thread pool, decrement the global count.
5688  //
5689  // There is a small timing hole here - if the worker thread was
5690  // just waking up after sleeping in the pool, had reset its
5691  // th_active_in_pool flag but not decremented the global counter
5692  // __kmp_thread_pool_active_nth yet, then the global counter
5693  // might not get updated.
5694  //
5695  // Currently, this can only happen as the library is unloaded,
5696  // so there are no harmful side effects.
5697  //
5698  if ( thread->th.th_active_in_pool ) {
5699  thread->th.th_active_in_pool = FALSE;
5700  KMP_TEST_THEN_DEC32(
5701  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5702  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5703  }
5704 
5705  // Decrement # of [worker] threads in the pool.
5706  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5707  --__kmp_thread_pool_nth;
5708  }; // if
5709 
5710  __kmp_free_implicit_task(thread);
5711 
5712  // Free the fast memory for tasking
5713  #if USE_FAST_MEMORY
5714  __kmp_free_fast_memory( thread );
5715  #endif /* USE_FAST_MEMORY */
5716 
5717  __kmp_suspend_uninitialize_thread( thread );
5718 
5719  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5720  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5721 
5722  -- __kmp_all_nth;
5723  // __kmp_nth was decremented when the thread was added to the pool.
5724 
5725 #ifdef KMP_ADJUST_BLOCKTIME
5726  /* Adjust blocktime back to user setting or default if necessary */
5727  /* Middle initialization might never have occurred */
5728  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5729  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5730  if ( __kmp_nth <= __kmp_avail_proc ) {
5731  __kmp_zero_bt = FALSE;
5732  }
5733  }
5734 #endif /* KMP_ADJUST_BLOCKTIME */
5735 
5736  /* free the memory being used */
5737  if( __kmp_env_consistency_check ) {
5738  if ( thread->th.th_cons ) {
5739  __kmp_free_cons_stack( thread->th.th_cons );
5740  thread->th.th_cons = NULL;
5741  }; // if
5742  }
5743 
5744  if ( thread->th.th_pri_common != NULL ) {
5745  __kmp_free( thread->th.th_pri_common );
5746  thread->th.th_pri_common = NULL;
5747  }; // if
5748 
5749  if (thread->th.th_task_state_memo_stack != NULL) {
5750  __kmp_free(thread->th.th_task_state_memo_stack);
5751  thread->th.th_task_state_memo_stack = NULL;
5752  }
5753 
5754  #if KMP_USE_BGET
5755  if ( thread->th.th_local.bget_data != NULL ) {
5756  __kmp_finalize_bget( thread );
5757  }; // if
5758  #endif
5759 
5760 #if KMP_AFFINITY_SUPPORTED
5761  if ( thread->th.th_affin_mask != NULL ) {
5762  KMP_CPU_FREE( thread->th.th_affin_mask );
5763  thread->th.th_affin_mask = NULL;
5764  }; // if
5765 #endif /* KMP_AFFINITY_SUPPORTED */
5766 
5767  __kmp_reap_team( thread->th.th_serial_team );
5768  thread->th.th_serial_team = NULL;
5769  __kmp_free( thread );
5770 
5771  KMP_MB();
5772 
5773 } // __kmp_reap_thread
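
// Illustrative sketch (not part of the runtime; hypothetical names): for a
// non-root thread, the reap above is essentially "release the worker from the
// barrier it is parked on so it can run to completion, then join the OS thread
// and free its per-thread state". A minimal pthread model of that sequence:
#if 0
#include <pthread.h>
#include <stdbool.h>

typedef struct parked_worker {
    pthread_mutex_t lock;
    pthread_cond_t  go;
    bool            released;        // stands in for the b_go flag release
    pthread_t       os_thread;
} parked_worker_t;

static void reap_parked_worker( parked_worker_t *w )
{
    pthread_mutex_lock( &w->lock );
    w->released = true;              // let the parked worker observe shutdown
    pthread_cond_signal( &w->go );
    pthread_mutex_unlock( &w->lock );
    pthread_join( w->os_thread, NULL );   // counterpart of __kmp_reap_worker()
    // ...per-thread resources would be freed here, as __kmp_reap_thread does above...
}
#endif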
5774 
5775 static void
5776 __kmp_internal_end(void)
5777 {
5778  int i;
5779 
5780  /* First, unregister the library */
5781  __kmp_unregister_library();
5782 
5783  #if KMP_OS_WINDOWS
5784  /* In Win static library, we can't tell when a root actually dies, so we
5785  reclaim the data structures for any root threads that have died but not
5786  unregistered themselves, in order to shut down cleanly.
5787  In Win dynamic library we also can't tell when a thread dies.
5788  */
5789  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5790  #endif
5791 
5792  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5793  if( __kmp_root[i] )
5794  if( __kmp_root[i]->r.r_active )
5795  break;
5796  KMP_MB(); /* Flush all pending memory write invalidates. */
5797  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5798 
5799  if ( i < __kmp_threads_capacity ) {
5800 #if KMP_USE_MONITOR
5801  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5802  KMP_MB(); /* Flush all pending memory write invalidates. */
5803 
5804  //
5805  // Need to check that monitor was initialized before reaping it.
5806  // If we are called from __kmp_atfork_child (which sets
5807  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5808  // contain valid data, but it is only valid in the parent process,
5809  // not the child.
5810  //
5811  // New behavior (201008): instead of keying off of the flag
5812  // __kmp_init_parallel, the monitor thread creation is keyed off
5813  // of the new flag __kmp_init_monitor.
5814  //
5815  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5816  if ( TCR_4( __kmp_init_monitor ) ) {
5817  __kmp_reap_monitor( & __kmp_monitor );
5818  TCW_4( __kmp_init_monitor, 0 );
5819  }
5820  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5821  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5822 #endif // KMP_USE_MONITOR
5823  } else {
5824  /* TODO move this to cleanup code */
5825  #ifdef KMP_DEBUG
5826  /* make sure that everything has properly ended */
5827  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5828  if( __kmp_root[i] ) {
5829 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5830  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5831  }
5832  }
5833  #endif
5834 
5835  KMP_MB();
5836 
5837  // Reap the worker threads.
5838  // This is valid for now, but be careful if threads are reaped sooner.
5839  while ( __kmp_thread_pool != NULL ) { // Loop thru all the threads in the pool.
5840  // Get the next thread from the pool.
5841  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5842  __kmp_thread_pool = thread->th.th_next_pool;
5843  // Reap it.
5844  thread->th.th_next_pool = NULL;
5845  thread->th.th_in_pool = FALSE;
5846  __kmp_reap_thread( thread, 0 );
5847  }; // while
5848  __kmp_thread_pool_insert_pt = NULL;
5849 
5850  // Reap teams.
5851  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5852  // Get the next team from the pool.
5853  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5854  __kmp_team_pool = team->t.t_next_pool;
5855  // Reap it.
5856  team->t.t_next_pool = NULL;
5857  __kmp_reap_team( team );
5858  }; // while
5859 
5860  __kmp_reap_task_teams( );
5861 
5862  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5863  // TBD: Add some checking...
5864  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5865  }
5866 
5867  /* Make sure all threadprivate destructors get run by joining with all worker
5868  threads before resetting this flag */
5869  TCW_SYNC_4(__kmp_init_common, FALSE);
5870 
5871  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5872  KMP_MB();
5873 
5874 #if KMP_USE_MONITOR
5875  //
5876  // See note above: One of the possible fixes for CQ138434 / CQ140126
5877  //
5878  // FIXME: push both code fragments down and CSE them?
5879  // push them into __kmp_cleanup() ?
5880  //
5881  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5882  if ( TCR_4( __kmp_init_monitor ) ) {
5883  __kmp_reap_monitor( & __kmp_monitor );
5884  TCW_4( __kmp_init_monitor, 0 );
5885  }
5886  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5887  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5888 #endif
5889  } /* else !__kmp_global.t_active */
5890  TCW_4(__kmp_init_gtid, FALSE);
5891  KMP_MB(); /* Flush all pending memory write invalidates. */
5892 
5893  __kmp_cleanup();
5894 #if OMPT_SUPPORT
5895  ompt_fini();
5896 #endif
5897 }
5898 
5899 void
5900 __kmp_internal_end_library( int gtid_req )
5901 {
5902  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5903  /* this shouldn't be a race condition because __kmp_internal_end() is the
5904  * only place to clear __kmp_serial_init */
5905  /* we'll check this later too, after we get the lock */
5906  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5907  // because the next check will work in any case.
5908  if( __kmp_global.g.g_abort ) {
5909  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5910  /* TODO abort? */
5911  return;
5912  }
5913  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5914  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5915  return;
5916  }
5917 
5918 
5919  KMP_MB(); /* Flush all pending memory write invalidates. */
5920 
5921  /* find out who we are and what we should do */
5922  {
5923  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5924  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5925  if( gtid == KMP_GTID_SHUTDOWN ) {
5926  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5927  return;
5928  } else if( gtid == KMP_GTID_MONITOR ) {
5929  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5930  return;
5931  } else if( gtid == KMP_GTID_DNE ) {
5932  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5933  /* we don't know who we are, but we may still shut down the library */
5934  } else if( KMP_UBER_GTID( gtid )) {
5935  /* unregister ourselves as an uber thread. gtid is no longer valid */
5936  if( __kmp_root[gtid]->r.r_active ) {
5937  __kmp_global.g.g_abort = -1;
5938  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5939  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5940  return;
5941  } else {
5942  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5943  __kmp_unregister_root_current_thread( gtid );
5944  }
5945  } else {
5946  /* worker threads may call this function through the atexit handler, if they call exit() */
5947  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5948  TODO: do a thorough shutdown instead
5949  */
5950  #ifdef DUMP_DEBUG_ON_EXIT
5951  if ( __kmp_debug_buf )
5952  __kmp_dump_debug_buffer( );
5953  #endif
5954  return;
5955  }
5956  }
5957  /* synchronize the termination process */
5958  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5959 
5960  /* have we already finished */
5961  if( __kmp_global.g.g_abort ) {
5962  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5963  /* TODO abort? */
5964  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5965  return;
5966  }
5967  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5968  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5969  return;
5970  }
5971 
5972  /* We need this lock to enforce mutual exclusion between this read of
5973  __kmp_threads_capacity and the writing by __kmp_register_root.
5974  Alternatively, we can use a counter of roots that is
5975  atomically updated by __kmp_get_global_thread_id_reg,
5976  __kmp_do_serial_initialize and __kmp_internal_end_*.
5977  */
5978  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5979 
5980  /* now we can safely conduct the actual termination */
5981  __kmp_internal_end();
5982 
5983  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5984  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5985 
5986  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5987 
5988  #ifdef DUMP_DEBUG_ON_EXIT
5989  if ( __kmp_debug_buf )
5990  __kmp_dump_debug_buffer();
5991  #endif
5992 
5993  #if KMP_OS_WINDOWS
5994  __kmp_close_console();
5995  #endif
5996 
5997  __kmp_fini_allocator();
5998 
5999 } // __kmp_internal_end_library
6000 
6001 void
6002 __kmp_internal_end_thread( int gtid_req )
6003 {
6004  int i;
6005 
6006  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6007  /* this shouldn't be a race condition because __kmp_internal_end() is the
6008  * only place to clear __kmp_serial_init */
6009  /* we'll check this later too, after we get the lock */
6010  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6011  // because the next check will work in any case.
6012  if( __kmp_global.g.g_abort ) {
6013  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6014  /* TODO abort? */
6015  return;
6016  }
6017  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6018  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6019  return;
6020  }
6021 
6022  KMP_MB(); /* Flush all pending memory write invalidates. */
6023 
6024  /* find out who we are and what we should do */
6025  {
6026  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6027  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
6028  if( gtid == KMP_GTID_SHUTDOWN ) {
6029  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6030  return;
6031  } else if( gtid == KMP_GTID_MONITOR ) {
6032  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6033  return;
6034  } else if( gtid == KMP_GTID_DNE ) {
6035  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6036  return;
6037  /* we don't know who we are */
6038  } else if( KMP_UBER_GTID( gtid )) {
6039  /* unregister ourselves as an uber thread. gtid is no longer valid */
6040  if( __kmp_root[gtid]->r.r_active ) {
6041  __kmp_global.g.g_abort = -1;
6042  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6043  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6044  return;
6045  } else {
6046  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6047  __kmp_unregister_root_current_thread( gtid );
6048  }
6049  } else {
6050  /* just a worker thread, let's leave */
6051  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6052 
6053  if ( gtid >= 0 ) {
6054  __kmp_threads[gtid]->th.th_task_team = NULL;
6055  }
6056 
6057  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6058  return;
6059  }
6060  }
6061  #if defined KMP_DYNAMIC_LIB
6062  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6063  // because it is better to shut down later in the library destructor.
6064  // The reason for this change is a performance problem when a non-OpenMP thread
6065  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6066  // keeping worker threads alive until the program shuts down.
6067  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6068  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6069  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6070  return;
6071  #endif
6072  /* synchronize the termination process */
6073  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6074 
6075  /* have we already finished */
6076  if( __kmp_global.g.g_abort ) {
6077  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6078  /* TODO abort? */
6079  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6080  return;
6081  }
6082  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6083  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6084  return;
6085  }
6086 
6087  /* We need this lock to enforce mutual exclusion between this read of
6088  __kmp_threads_capacity and the writing by __kmp_register_root.
6089  Alternatively, we can use a counter of roots that is
6090  atomically updated by __kmp_get_global_thread_id_reg,
6091  __kmp_do_serial_initialize and __kmp_internal_end_*.
6092  */
6093 
6094  /* should we finish the run-time? are all siblings done? */
6095  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6096 
6097  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6098  if ( KMP_UBER_GTID( i ) ) {
6099  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6100  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6101  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6102  return;
6103  };
6104  }
6105 
6106  /* now we can safely conduct the actual termination */
6107 
6108  __kmp_internal_end();
6109 
6110  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6111  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6112 
6113  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6114 
6115  #ifdef DUMP_DEBUG_ON_EXIT
6116  if ( __kmp_debug_buf )
6117  __kmp_dump_debug_buffer();
6118  #endif
6119 } // __kmp_internal_end_thread
6120 
6121 // -------------------------------------------------------------------------------------------------
6122 // Library registration stuff.
6123 
6124 static long __kmp_registration_flag = 0;
6125  // Random value used to indicate library initialization.
6126 static char * __kmp_registration_str = NULL;
6127  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6128 
6129 
6130 static inline
6131 char *
6132 __kmp_reg_status_name() {
6133  /*
6134  On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
6135  If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
6136  the registered_lib_env env var cannot be found, because its name will contain a different pid.
6137  */
6138  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6139 } // __kmp_reg_status_name
6140 
6141 
6142 void
6143 __kmp_register_library_startup(
6144  void
6145 ) {
6146 
6147  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6148  int done = 0;
6149  union {
6150  double dtime;
6151  long ltime;
6152  } time;
6153  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6154  __kmp_initialize_system_tick();
6155  #endif
6156  __kmp_read_system_time( & time.dtime );
6157  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6158  __kmp_registration_str =
6159  __kmp_str_format(
6160  "%p-%lx-%s",
6161  & __kmp_registration_flag,
6162  __kmp_registration_flag,
6163  KMP_LIBRARY_FILE
6164  );
6165 
6166  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6167 
6168  while ( ! done ) {
6169 
6170  char * value = NULL; // Actual value of the environment variable.
6171 
6172  // Set the environment variable, but do not overwrite it if it already exists.
6173  __kmp_env_set( name, __kmp_registration_str, 0 );
6174  // Check that the variable was actually written.
6175  value = __kmp_env_get( name );
6176  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6177 
6178  done = 1; // Ok, environment variable set successfully, exit the loop.
6179 
6180  } else {
6181 
6182  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6183  // Check whether it is alive or dead.
6184  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6185  char * tail = value;
6186  char * flag_addr_str = NULL;
6187  char * flag_val_str = NULL;
6188  char const * file_name = NULL;
6189  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6190  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6191  file_name = tail;
6192  if ( tail != NULL ) {
6193  long * flag_addr = 0;
6194  long flag_val = 0;
6195  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6196  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6197  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6198  // First, check whether the environment-encoded address is mapped into the address space.
6199  // If so, dereference it to see if it still has the right value.
6200 
6201  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6202  neighbor = 1;
6203  } else {
6204  // If not, then we know the other copy of the library is no longer running.
6205  neighbor = 2;
6206  }; // if
6207  }; // if
6208  }; // if
6209  switch ( neighbor ) {
6210  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6211  // Assume it is the incompatible format of a future version of the library.
6212  // Assume the other library is alive.
6213  // WARN( ... ); // TODO: Issue a warning.
6214  file_name = "unknown library";
6215  // Attention! Falling through to the next case. That's intentional.
6216  case 1 : { // Neighbor is alive.
6217  // Check whether this is allowed.
6218  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6219  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6220  // That's not allowed. Issue fatal error.
6221  __kmp_msg(
6222  kmp_ms_fatal,
6223  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6224  KMP_HNT( DuplicateLibrary ),
6225  __kmp_msg_null
6226  );
6227  }; // if
6228  KMP_INTERNAL_FREE( duplicate_ok );
6229  __kmp_duplicate_library_ok = 1;
6230  done = 1; // Exit the loop.
6231  } break;
6232  case 2 : { // Neighbor is dead.
6233  // Clear the variable and try to register library again.
6234  __kmp_env_unset( name );
6235  } break;
6236  default : {
6237  KMP_DEBUG_ASSERT( 0 );
6238  } break;
6239  }; // switch
6240 
6241  }; // if
6242  KMP_INTERNAL_FREE( (void *) value );
6243 
6244  }; // while
6245  KMP_INTERNAL_FREE( (void *) name );
6246 
6247 } // func __kmp_register_library_startup
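
A minimal standalone sketch of the scheme used above: the value stored in __KMP_REGISTERED_LIB_<pid> packs the address of the registration flag, its random value, and the library file name as "%p-%lx-%s"; a second copy of the runtime splits the string and checks whether the encoded address still holds the encoded value. The code below is illustrative only (error handling omitted, names hypothetical), not the runtime's own API.

#include <stdio.h>
#include <string.h>

static long registration_flag = 0xCAFE1234L;   /* stands in for __kmp_registration_flag */

int main(void) {
    /* Compose a value in the same "%p-%lx-%s" shape as __kmp_registration_str. */
    char value[256];
    snprintf(value, sizeof(value), "%p-%lx-%s",
             (void *)&registration_flag, registration_flag, "libomp.so");

    /* Split on the first two '-' separators, mirroring the two
       __kmp_str_split() calls in __kmp_register_library_startup(). */
    char *addr_str = value;
    char *val_str  = strchr(addr_str, '-'); *val_str++ = '\0';
    char *file_str = strchr(val_str, '-');  *file_str++ = '\0';

    void *addr = NULL;
    long  val  = 0;
    sscanf(addr_str, "%p", &addr);
    sscanf(val_str, "%lx", &val);

    /* "Neighbor is alive" test: the encoded address is ours and still holds the
       encoded value (the real code first checks that the address is mapped). */
    int alive = (addr == (void *)&registration_flag) && (*(long *)addr == val);
    printf("alive=%d, file=%s\n", alive, file_str);
    return 0;
}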
6248 
6249 
6250 void
6251 __kmp_unregister_library( void ) {
6252 
6253  char * name = __kmp_reg_status_name();
6254  char * value = __kmp_env_get( name );
6255 
6256  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6257  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6258  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6259  // Ok, this is our variable. Delete it.
6260  __kmp_env_unset( name );
6261  }; // if
6262 
6263  KMP_INTERNAL_FREE( __kmp_registration_str );
6264  KMP_INTERNAL_FREE( value );
6265  KMP_INTERNAL_FREE( name );
6266 
6267  __kmp_registration_flag = 0;
6268  __kmp_registration_str = NULL;
6269 
6270 } // __kmp_unregister_library
6271 
6272 
6273 // End of Library registration stuff.
6274 // -------------------------------------------------------------------------------------------------
6275 
6276 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6277 
6278 static void __kmp_check_mic_type()
6279 {
6280  kmp_cpuid_t cpuid_state = {0};
6281  kmp_cpuid_t * cs_p = &cpuid_state;
6282  __kmp_x86_cpuid(1, 0, cs_p);
6283  // We don't support mic1 at the moment
6284  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6285  __kmp_mic_type = mic2;
6286  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6287  __kmp_mic_type = mic3;
6288  } else {
6289  __kmp_mic_type = non_mic;
6290  }
6291 }
6292 
6293 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
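
The masks in __kmp_check_mic_type() select the family/model fields of the CPUID leaf-1 EAX value: 0xB10 is family 0xB (KNC per the mic2 comments later in this file), and 0x50670 is family 6 with display model 0x57 (presumably KNL for mic3). A standalone sketch, decoding those fields with plain arithmetic and hard-coded EAX values rather than the cpuid instruction:

#include <stdio.h>

/* Decode the family/model fields of a CPUID leaf-1 EAX value -- the same
   bits that the masks 0xff0 and 0xf0ff0 above select. */
static void decode_leaf1_eax(unsigned eax) {
    unsigned stepping    = eax & 0xF;
    unsigned model       = (eax >> 4) & 0xF;
    unsigned family      = (eax >> 8) & 0xF;
    unsigned ext_model   = (eax >> 16) & 0xF;
    unsigned ext_family  = (eax >> 20) & 0xFF;
    unsigned disp_family = family + (family == 0xF ? ext_family : 0);
    unsigned disp_model  = model + ((family == 0xF || family == 0x6) ? (ext_model << 4) : 0);
    printf("eax=0x%08x -> family 0x%x, model 0x%x, stepping %u\n",
           eax, disp_family, disp_model, stepping);
}

int main(void) {
    decode_leaf1_eax(0x00000B10); /* matches the (eax & 0xff0)   == 0xB10   check */
    decode_leaf1_eax(0x00050670); /* matches the (eax & 0xf0ff0) == 0x50670 check */
    return 0;
}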
6294 
6295 static void
6296 __kmp_do_serial_initialize( void )
6297 {
6298  int i, gtid;
6299  int size;
6300 
6301  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6302 
6303  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6304  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6305  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6306  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6307  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6308 
6309 #if OMPT_SUPPORT
6310  ompt_pre_init();
6311 #endif
6312 
6313  __kmp_validate_locks();
6314 
6315  /* Initialize internal memory allocator */
6316  __kmp_init_allocator();
6317 
6318  /* Register the library startup via an environment variable
6319  and check to see whether another copy of the library is already
6320  registered. */
6321 
6322  __kmp_register_library_startup( );
6323 
6324  /* TODO reinitialization of library */
6325  if( TCR_4(__kmp_global.g.g_done) ) {
6326  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6327  }
6328 
6329  __kmp_global.g.g_abort = 0;
6330  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6331 
6332  /* initialize the locks */
6333 #if KMP_USE_ADAPTIVE_LOCKS
6334 #if KMP_DEBUG_ADAPTIVE_LOCKS
6335  __kmp_init_speculative_stats();
6336 #endif
6337 #endif
6338 #if KMP_STATS_ENABLED
6339  __kmp_stats_init();
6340 #endif
6341  __kmp_init_lock( & __kmp_global_lock );
6342  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6343  __kmp_init_lock( & __kmp_debug_lock );
6344  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6345  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6346  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6347  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6348  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6349  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6350  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6351  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6352  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6353  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6354  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6355  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6356  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6357  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6358  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6359 #if KMP_USE_MONITOR
6360  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6361 #endif
6362  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6363 
6364  /* conduct initialization and initial setup of configuration */
6365 
6366  __kmp_runtime_initialize();
6367 
6368 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6369  __kmp_check_mic_type();
6370 #endif
6371 
6372  // Some global variable initialization moved here from kmp_env_initialize()
6373 #ifdef KMP_DEBUG
6374  kmp_diag = 0;
6375 #endif
6376  __kmp_abort_delay = 0;
6377 
6378  // From __kmp_init_dflt_team_nth()
6379  /* assume the entire machine will be used */
6380  __kmp_dflt_team_nth_ub = __kmp_xproc;
6381  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6382  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6383  }
6384  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6385  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6386  }
6387  __kmp_max_nth = __kmp_sys_max_nth;
6388 
6389  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6390  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6391 #if KMP_USE_MONITOR
6392  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6393  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6394 #endif
6395  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6396  __kmp_library = library_throughput;
6397  // From KMP_SCHEDULE initialization
6398  __kmp_static = kmp_sch_static_balanced;
6399  // AC: do not use analytical here, because it is non-monotonic
6400  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6401  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6402  // Barrier initialization. Moved here from the barrier branch bit control and barrier
6403  // method control parts of __kmp_env_initialize().
6404  #if KMP_FAST_REDUCTION_BARRIER
6405  #define kmp_reduction_barrier_gather_bb ((int)1)
6406  #define kmp_reduction_barrier_release_bb ((int)1)
6407  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6408  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6409  #endif // KMP_FAST_REDUCTION_BARRIER
6410  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6411  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6412  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6413  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6414  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6415  #if KMP_FAST_REDUCTION_BARRIER
6416  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6417  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6418  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6419  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6420  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6421  }
6422  #endif // KMP_FAST_REDUCTION_BARRIER
6423  }
6424  #if KMP_FAST_REDUCTION_BARRIER
6425  #undef kmp_reduction_barrier_release_pat
6426  #undef kmp_reduction_barrier_gather_pat
6427  #undef kmp_reduction_barrier_release_bb
6428  #undef kmp_reduction_barrier_gather_bb
6429  #endif // KMP_FAST_REDUCTION_BARRIER
6430 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6431  if (__kmp_mic_type == mic2) { // KNC
6432  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6433  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6434  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6435  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6436  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6437  }
6438 #if KMP_FAST_REDUCTION_BARRIER
6439  if (__kmp_mic_type == mic2) { // KNC
6440  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6441  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6442  }
6443 #endif
6444 #endif
6445 
6446  // From KMP_CHECKS initialization
6447 #ifdef KMP_DEBUG
6448  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6449 #else
6450  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6451 #endif
6452 
6453  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6454  __kmp_foreign_tp = TRUE;
6455 
6456  __kmp_global.g.g_dynamic = FALSE;
6457  __kmp_global.g.g_dynamic_mode = dynamic_default;
6458 
6459  __kmp_env_initialize( NULL );
6460 
6461  // Print all messages in message catalog for testing purposes.
6462  #ifdef KMP_DEBUG
6463  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6464  if ( __kmp_str_match_true( val ) ) {
6465  kmp_str_buf_t buffer;
6466  __kmp_str_buf_init( & buffer );
6467  __kmp_i18n_dump_catalog( & buffer );
6468  __kmp_printf( "%s", buffer.str );
6469  __kmp_str_buf_free( & buffer );
6470  }; // if
6471  __kmp_env_free( & val );
6472  #endif
6473 
6474  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6475  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6476  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6477 
6478  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6479  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6480  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6481  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6482  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6483  __kmp_thread_pool = NULL;
6484  __kmp_thread_pool_insert_pt = NULL;
6485  __kmp_team_pool = NULL;
6486 
6487  /* Allocate all of the variable sized records */
6488  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6489  /* Since allocation is cache-aligned, just add extra padding at the end */
6490  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6491  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6492  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6493 
6494  /* init thread counts */
6495  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // These asserts fail if the library is reinitializing and
6496  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something went wrong during termination.
6497  __kmp_all_nth = 0;
6498  __kmp_nth = 0;
6499 
6500  /* setup the uber master thread and hierarchy */
6501  gtid = __kmp_register_root( TRUE );
6502  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6503  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6504  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6505 
6506  KMP_MB(); /* Flush all pending memory write invalidates. */
6507 
6508  __kmp_common_initialize();
6509 
6510  #if KMP_OS_UNIX
6511  /* invoke the child fork handler */
6512  __kmp_register_atfork();
6513  #endif
6514 
6515  #if ! defined KMP_DYNAMIC_LIB
6516  {
6517  /* Invoke the exit handler when the program finishes, only for static library.
6518  For dynamic library, we already have _fini and DllMain.
6519  */
6520  int rc = atexit( __kmp_internal_end_atexit );
6521  if ( rc != 0 ) {
6522  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6523  }; // if
6524  }
6525  #endif
6526 
6527  #if KMP_HANDLE_SIGNALS
6528  #if KMP_OS_UNIX
6529  /* NOTE: make sure that this is called before the user installs
6530  * their own signal handlers so that the user handlers
6531  * are called first. this way they can return false,
6532  * not call our handler, avoid terminating the library,
6533  * and continue execution where they left off. */
6534  __kmp_install_signals( FALSE );
6535  #endif /* KMP_OS_UNIX */
6536  #if KMP_OS_WINDOWS
6537  __kmp_install_signals( TRUE );
6538  #endif /* KMP_OS_WINDOWS */
6539  #endif
6540 
6541  /* we have finished the serial initialization */
6542  __kmp_init_counter ++;
6543 
6544  __kmp_init_serial = TRUE;
6545 
6546  if (__kmp_settings) {
6547  __kmp_env_print();
6548  }
6549 
6550 #if OMP_40_ENABLED
6551  if (__kmp_display_env || __kmp_display_env_verbose) {
6552  __kmp_env_print_2();
6553  }
6554 #endif // OMP_40_ENABLED
6555 
6556 #if OMPT_SUPPORT
6557  ompt_post_init();
6558 #endif
6559 
6560  KMP_MB();
6561 
6562  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6563 }
6564 
6565 void
6566 __kmp_serial_initialize( void )
6567 {
6568  if ( __kmp_init_serial ) {
6569  return;
6570  }
6571  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6572  if ( __kmp_init_serial ) {
6573  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6574  return;
6575  }
6576  __kmp_do_serial_initialize();
6577  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6578 }
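
__kmp_serial_initialize() (and __kmp_middle_initialize() below) use the classic check / acquire / re-check shape so that only one caller runs the expensive initialization. A distilled sketch of that idiom, with a pthread mutex standing in for the bootstrap lock and hypothetical names throughout:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile int initialized = 0;

static void do_initialize(void) { /* expensive one-time setup */ }

/* Same shape as __kmp_serial_initialize(): cheap unlocked check, then
   re-check under the lock so only one caller runs do_initialize().
   Note: volatile alone is not a memory barrier; the real runtime relies
   on its TCR_*/TCW_SYNC_* macros and bootstrap locks for ordering. */
void ensure_initialized(void) {
    if (initialized) return;
    pthread_mutex_lock(&init_lock);
    if (initialized) {
        pthread_mutex_unlock(&init_lock);
        return;
    }
    do_initialize();
    initialized = 1;
    pthread_mutex_unlock(&init_lock);
}

int main(void) {
    ensure_initialized();
    ensure_initialized(); /* second call is a no-op */
    printf("initialized=%d\n", initialized);
    return 0;
}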
6579 
6580 static void
6581 __kmp_do_middle_initialize( void )
6582 {
6583  int i, j;
6584  int prev_dflt_team_nth;
6585 
6586  if( !__kmp_init_serial ) {
6587  __kmp_do_serial_initialize();
6588  }
6589 
6590  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6591 
6592  //
6593  // Save the previous value for the __kmp_dflt_team_nth so that
6594  // we can avoid some reinitialization if it hasn't changed.
6595  //
6596  prev_dflt_team_nth = __kmp_dflt_team_nth;
6597 
6598 #if KMP_AFFINITY_SUPPORTED
6599  //
6600  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6601  // number of cores on the machine.
6602  //
6603  __kmp_affinity_initialize();
6604 
6605  //
6606  // Run through the __kmp_threads array and set the affinity mask
6607  // for each root thread that is currently registered with the RTL.
6608  //
6609  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6610  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6611  __kmp_affinity_set_init_mask( i, TRUE );
6612  }
6613  }
6614 #endif /* KMP_AFFINITY_SUPPORTED */
6615 
6616  KMP_ASSERT( __kmp_xproc > 0 );
6617  if ( __kmp_avail_proc == 0 ) {
6618  __kmp_avail_proc = __kmp_xproc;
6619  }
6620 
6621  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6622  j = 0;
6623  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6624  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6625  j++;
6626  }
6627 
6628  if ( __kmp_dflt_team_nth == 0 ) {
6629 #ifdef KMP_DFLT_NTH_CORES
6630  //
6631  // Default #threads = #cores
6632  //
6633  __kmp_dflt_team_nth = __kmp_ncores;
6634  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6635  __kmp_dflt_team_nth ) );
6636 #else
6637  //
6638  // Default #threads = #available OS procs
6639  //
6640  __kmp_dflt_team_nth = __kmp_avail_proc;
6641  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6642  __kmp_dflt_team_nth ) );
6643 #endif /* KMP_DFLT_NTH_CORES */
6644  }
6645 
6646  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6647  __kmp_dflt_team_nth = KMP_MIN_NTH;
6648  }
6649  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6650  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6651  }
6652 
6653  //
6654  // There's no harm in continuing if the following check fails,
6655  // but it indicates an error in the previous logic.
6656  //
6657  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6658 
6659  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6660  //
6661  // Run through the __kmp_threads array and set the num threads icv
6662  // for each root thread that is currently registered with the RTL
6663  // (which has not already explicitly set its nthreads-var with a
6664  // call to omp_set_num_threads()).
6665  //
6666  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6667  kmp_info_t *thread = __kmp_threads[ i ];
6668  if ( thread == NULL ) continue;
6669  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6670 
6671  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6672  }
6673  }
6674  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6675  __kmp_dflt_team_nth) );
6676 
6677 #ifdef KMP_ADJUST_BLOCKTIME
6678  /* Adjust blocktime to zero if necessary */
6679  /* now that __kmp_avail_proc is set */
6680  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6681  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6682  if ( __kmp_nth > __kmp_avail_proc ) {
6683  __kmp_zero_bt = TRUE;
6684  }
6685  }
6686 #endif /* KMP_ADJUST_BLOCKTIME */
6687 
6688  /* we have finished middle initialization */
6689  TCW_SYNC_4(__kmp_init_middle, TRUE);
6690 
6691  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6692 }
6693 
6694 void
6695 __kmp_middle_initialize( void )
6696 {
6697  if ( __kmp_init_middle ) {
6698  return;
6699  }
6700  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6701  if ( __kmp_init_middle ) {
6702  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6703  return;
6704  }
6705  __kmp_do_middle_initialize();
6706  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6707 }
6708 
6709 void
6710 __kmp_parallel_initialize( void )
6711 {
6712  int gtid = __kmp_entry_gtid(); // this might be a new root
6713 
6714  /* synchronize parallel initialization (for sibling) */
6715  if( TCR_4(__kmp_init_parallel) ) return;
6716  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6717  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6718 
6719  /* TODO reinitialization after we have already shut down */
6720  if( TCR_4(__kmp_global.g.g_done) ) {
6721  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6722  __kmp_infinite_loop();
6723  }
6724 
6725  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6726  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
6727  */
6728  if( !__kmp_init_middle ) {
6729  __kmp_do_middle_initialize();
6730  }
6731 
6732  /* begin initialization */
6733  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6734  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6735 
6736 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737  //
6738  // Save the FP control regs.
6739  // Worker threads will set theirs to these values at thread startup.
6740  //
6741  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6742  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6743  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6744 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6745 
6746 #if KMP_OS_UNIX
6747 # if KMP_HANDLE_SIGNALS
6748  /* must be after __kmp_serial_initialize */
6749  __kmp_install_signals( TRUE );
6750 # endif
6751 #endif
6752 
6753  __kmp_suspend_initialize();
6754 
6755 #if defined(USE_LOAD_BALANCE)
6756  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6757  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6758  }
6759 #else
6760  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6761  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6762  }
6763 #endif
6764 
6765  if ( __kmp_version ) {
6766  __kmp_print_version_2();
6767  }
6768 
6769  /* we have finished parallel initialization */
6770  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6771 
6772  KMP_MB();
6773  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6774 
6775  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6776 }
6777 
6778 
6779 /* ------------------------------------------------------------------------ */
6780 
6781 void
6782 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6783  kmp_team_t *team )
6784 {
6785  kmp_disp_t *dispatch;
6786 
6787  KMP_MB();
6788 
6789  /* none of the threads have encountered any constructs, yet. */
6790  this_thr->th.th_local.this_construct = 0;
6791 #if KMP_CACHE_MANAGE
6792  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6793 #endif /* KMP_CACHE_MANAGE */
6794  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6795  KMP_DEBUG_ASSERT( dispatch );
6796  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6797  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6798 
6799  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6800 #if OMP_45_ENABLED
6801  dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6802 #endif
6803  if( __kmp_env_consistency_check )
6804  __kmp_push_parallel( gtid, team->t.t_ident );
6805 
6806  KMP_MB(); /* Flush all pending memory write invalidates. */
6807 }
6808 
6809 void
6810 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6811  kmp_team_t *team )
6812 {
6813  if( __kmp_env_consistency_check )
6814  __kmp_pop_parallel( gtid, team->t.t_ident );
6815 
6816  __kmp_finish_implicit_task(this_thr);
6817 }
6818 
6819 int
6820 __kmp_invoke_task_func( int gtid )
6821 {
6822  int rc;
6823  int tid = __kmp_tid_from_gtid( gtid );
6824  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6825  kmp_team_t *team = this_thr->th.th_team;
6826 
6827  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6828 #if USE_ITT_BUILD
6829  if ( __itt_stack_caller_create_ptr ) {
6830  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6831  }
6832 #endif /* USE_ITT_BUILD */
6833 #if INCLUDE_SSC_MARKS
6834  SSC_MARK_INVOKING();
6835 #endif
6836 
6837 #if OMPT_SUPPORT
6838  void *dummy;
6839  void **exit_runtime_p;
6840  ompt_task_id_t my_task_id;
6841  ompt_parallel_id_t my_parallel_id;
6842 
6843  if (ompt_enabled) {
6844  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6845  ompt_task_info.frame.exit_runtime_frame);
6846  } else {
6847  exit_runtime_p = &dummy;
6848  }
6849 
6850 #if OMPT_TRACE
6851  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6852  my_parallel_id = team->t.ompt_team_info.parallel_id;
6853  if (ompt_enabled &&
6854  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6855  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6856  my_parallel_id, my_task_id);
6857  }
6858 #endif
6859 #endif
6860 
6861  {
6862  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6863  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6864  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6865  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6866 #if OMPT_SUPPORT
6867  , exit_runtime_p
6868 #endif
6869  );
6870 #if OMPT_SUPPORT
6871  *exit_runtime_p = NULL;
6872 #endif
6873  }
6874 
6875 #if USE_ITT_BUILD
6876  if ( __itt_stack_caller_create_ptr ) {
6877  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6878  }
6879 #endif /* USE_ITT_BUILD */
6880  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6881 
6882  return rc;
6883 }
6884 
6885 #if OMP_40_ENABLED
6886 void
6887 __kmp_teams_master( int gtid )
6888 {
6889  // This routine is called by all master threads in teams construct
6890  kmp_info_t *thr = __kmp_threads[ gtid ];
6891  kmp_team_t *team = thr->th.th_team;
6892  ident_t *loc = team->t.t_ident;
6893  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6894  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6895  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6896  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6897  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6898  // Launch the league of teams now, but do not let workers execute
6899  // (they wait on the fork barrier until the next parallel region)
6900 #if INCLUDE_SSC_MARKS
6901  SSC_MARK_FORKING();
6902 #endif
6903  __kmp_fork_call( loc, gtid, fork_context_intel,
6904  team->t.t_argc,
6905 #if OMPT_SUPPORT
6906  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6907 #endif
6908  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6909  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6910  NULL );
6911 #if INCLUDE_SSC_MARKS
6912  SSC_MARK_JOINING();
6913 #endif
6914 
6915  // AC: last parameter "1" eliminates join barrier which won't work because
6916  // worker threads are in a fork barrier waiting for more parallel regions
6917  __kmp_join_call( loc, gtid
6918 #if OMPT_SUPPORT
6919  , fork_context_intel
6920 #endif
6921  , 1 );
6922 }
6923 
6924 int
6925 __kmp_invoke_teams_master( int gtid )
6926 {
6927  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6928  kmp_team_t *team = this_thr->th.th_team;
6929  #if KMP_DEBUG
6930  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6931  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6932  #endif
6933  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6934  __kmp_teams_master( gtid );
6935  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6936  return 1;
6937 }
6938 #endif /* OMP_40_ENABLED */
6939 
6940 /* this sets the requested number of threads for the next parallel region
6941  * encountered by this team */
6942 /* since this should be enclosed in the forkjoin critical section it
6943  * should avoid race conditions with asymmetrical nested parallelism */
6944 
6945 void
6946 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6947 {
6948  kmp_info_t *thr = __kmp_threads[gtid];
6949 
6950  if( num_threads > 0 )
6951  thr->th.th_set_nproc = num_threads;
6952 }
6953 
6954 #if OMP_40_ENABLED
6955 
6956 /* this sets the requested number of teams for the teams region and/or
6957  * the number of threads for the next parallel region encountered */
6958 void
6959 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6960 {
6961  kmp_info_t *thr = __kmp_threads[gtid];
6962  KMP_DEBUG_ASSERT(num_teams >= 0);
6963  KMP_DEBUG_ASSERT(num_threads >= 0);
6964 
6965  if( num_teams == 0 )
6966  num_teams = 1; // default number of teams is 1.
6967  if( num_teams > __kmp_max_nth ) { // if too many teams requested?
6968  if ( !__kmp_reserve_warn ) {
6969  __kmp_reserve_warn = 1;
6970  __kmp_msg(
6971  kmp_ms_warning,
6972  KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6973  KMP_HNT( Unset_ALL_THREADS ),
6974  __kmp_msg_null
6975  );
6976  }
6977  num_teams = __kmp_max_nth;
6978  }
6979  // Set number of teams (number of threads in the outer "parallel" of the teams)
6980  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6981 
6982  // Remember the number of threads for inner parallel regions
6983  if( num_threads == 0 ) {
6984  if( !TCR_4(__kmp_init_middle) )
6985  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6986  num_threads = __kmp_avail_proc / num_teams;
6987  if( num_teams * num_threads > __kmp_max_nth ) {
6988  // adjust num_threads w/o warning as it is not user setting
6989  num_threads = __kmp_max_nth / num_teams;
6990  }
6991  } else {
6992  if( num_teams * num_threads > __kmp_max_nth ) {
6993  int new_threads = __kmp_max_nth / num_teams;
6994  if ( !__kmp_reserve_warn ) { // user asked for too many threads
6995  __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6996  __kmp_msg(
6997  kmp_ms_warning,
6998  KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
6999  KMP_HNT( Unset_ALL_THREADS ),
7000  __kmp_msg_null
7001  );
7002  }
7003  num_threads = new_threads;
7004  }
7005  }
7006  thr->th.th_teams_size.nth = num_threads;
7007 }
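
The sizing arithmetic in __kmp_push_num_teams() boils down to: default to one team, split the available processors evenly when no per-team thread count is requested, and clamp so that num_teams * num_threads never exceeds the overall thread limit. A simplified standalone mirror of that arithmetic (warnings and middle-initialization omitted; helper name hypothetical):

#include <stdio.h>

static void size_teams(int avail_proc, int max_nth, int num_teams, int num_threads) {
    if (num_teams == 0) num_teams = 1;            /* default number of teams */
    if (num_teams > max_nth) num_teams = max_nth; /* too many teams requested */
    if (num_threads == 0) {
        num_threads = avail_proc / num_teams;     /* split procs evenly */
        if (num_teams * num_threads > max_nth)
            num_threads = max_nth / num_teams;
    } else if (num_teams * num_threads > max_nth) {
        num_threads = max_nth / num_teams;        /* request exceeds the limit */
    }
    printf("teams=%d threads/team=%d (total %d <= %d)\n",
           num_teams, num_threads, num_teams * num_threads, max_nth);
}

int main(void) {
    size_teams(/*avail_proc=*/64, /*max_nth=*/64, /*num_teams=*/4, /*num_threads=*/0);  /* -> 4 x 16 */
    size_teams(64, 64, 8, 16);                                                          /* -> 8 x 8  */
    return 0;
}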
7008 
7009 
7010 //
7011 // Set the proc_bind var to use in the following parallel region.
7012 //
7013 void
7014 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7015 {
7016  kmp_info_t *thr = __kmp_threads[gtid];
7017  thr->th.th_set_proc_bind = proc_bind;
7018 }
7019 
7020 #endif /* OMP_40_ENABLED */
7021 
7022 /* Launch the worker threads into the microtask. */
7023 
7024 void
7025 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7026 {
7027  kmp_info_t *this_thr = __kmp_threads[gtid];
7028 
7029 #ifdef KMP_DEBUG
7030  int f;
7031 #endif /* KMP_DEBUG */
7032 
7033  KMP_DEBUG_ASSERT( team );
7034  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7035  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7036  KMP_MB(); /* Flush all pending memory write invalidates. */
7037 
7038  team->t.t_construct = 0; /* no single directives seen yet */
7039  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7040 
7041  /* Reset the identifiers on the dispatch buffer */
7042  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7043  if ( team->t.t_max_nproc > 1 ) {
7044  int i;
7045  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7046  team->t.t_disp_buffer[ i ].buffer_index = i;
7047 #if OMP_45_ENABLED
7048  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7049 #endif
7050  }
7051  } else {
7052  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7053 #if OMP_45_ENABLED
7054  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7055 #endif
7056  }
7057 
7058  KMP_MB(); /* Flush all pending memory write invalidates. */
7059  KMP_ASSERT( this_thr->th.th_team == team );
7060 
7061 #ifdef KMP_DEBUG
7062  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7063  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7064  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7065  }
7066 #endif /* KMP_DEBUG */
7067 
7068  /* release the worker threads so they may begin working */
7069  __kmp_fork_barrier( gtid, 0 );
7070 }
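
__kmp_internal_fork() above resets the per-team dispatch buffers so that each of the __kmp_dispatch_num_buffers entries starts out tagged with its own index; worksharing loops then cycle through these buffers, which lets a new nowait loop begin while stragglers still reference the previous one. A simplified, hypothetical model of that round-robin reuse (the buffer count and struct are stand-ins, not the runtime's types):

#include <stdio.h>

#define NUM_BUFFERS 7   /* stand-in for __kmp_dispatch_num_buffers */

typedef struct { int buffer_index; } disp_buffer_t;

int main(void) {
    disp_buffer_t buf[NUM_BUFFERS];
    /* Reset as in __kmp_internal_fork(): each buffer tagged with its index. */
    for (int i = 0; i < NUM_BUFFERS; ++i)
        buf[i].buffer_index = i;

    /* Successive worksharing loops pick buffers round-robin, so consecutive
       nowait loops never share a buffer. */
    for (int loop = 0; loop < 10; ++loop)
        printf("loop %d uses dispatch buffer %d\n", loop, loop % NUM_BUFFERS);
    return 0;
}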
7071 
7072 
7073 void
7074 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7075 {
7076  kmp_info_t *this_thr = __kmp_threads[gtid];
7077 
7078  KMP_DEBUG_ASSERT( team );
7079  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7080  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7081  KMP_MB(); /* Flush all pending memory write invalidates. */
7082 
7083  /* Join barrier after fork */
7084 
7085 #ifdef KMP_DEBUG
7086  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7087  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7088  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7089  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7090  __kmp_print_structure();
7091  }
7092  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7093  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7094 #endif /* KMP_DEBUG */
7095 
7096  __kmp_join_barrier( gtid ); /* wait for everyone */
7097 
7098  KMP_MB(); /* Flush all pending memory write invalidates. */
7099  KMP_ASSERT( this_thr->th.th_team == team );
7100 }
7101 
7102 
7103 /* ------------------------------------------------------------------------ */
7104 /* ------------------------------------------------------------------------ */
7105 
7106 #ifdef USE_LOAD_BALANCE
7107 
7108 //
7109 // Return the number of worker threads actively spinning in the hot team,
7110 // if we are at the outermost level of parallelism. Otherwise, return 0.
7111 //
7112 static int
7113 __kmp_active_hot_team_nproc( kmp_root_t *root )
7114 {
7115  int i;
7116  int retval;
7117  kmp_team_t *hot_team;
7118 
7119  if ( root->r.r_active ) {
7120  return 0;
7121  }
7122  hot_team = root->r.r_hot_team;
7123  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7124  return hot_team->t.t_nproc - 1; // Don't count master thread
7125  }
7126 
7127  //
7128  // Skip the master thread - it is accounted for elsewhere.
7129  //
7130  retval = 0;
7131  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7132  if ( hot_team->t.t_threads[i]->th.th_active ) {
7133  retval++;
7134  }
7135  }
7136  return retval;
7137 }
7138 
7139 //
7140 // Perform an automatic adjustment to the number of
7141 // threads used by the next parallel region.
7142 //
7143 static int
7144 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7145 {
7146  int retval;
7147  int pool_active;
7148  int hot_team_active;
7149  int team_curr_active;
7150  int system_active;
7151 
7152  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7153  root, set_nproc ) );
7154  KMP_DEBUG_ASSERT( root );
7155  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7156  KMP_DEBUG_ASSERT( set_nproc > 1 );
7157 
7158  if ( set_nproc == 1) {
7159  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7160  return 1;
7161  }
7162 
7163  //
7164  // Threads that are active in the thread pool, active in the hot team
7165  // for this particular root (if we are at the outer par level), and
7166  // the currently executing thread (to become the master) are available
7167  // to add to the new team, but are currently contributing to the system
7168  // load, and must be accounted for.
7169  //
7170  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7171  hot_team_active = __kmp_active_hot_team_nproc( root );
7172  team_curr_active = pool_active + hot_team_active + 1;
7173 
7174  //
7175  // Check the system load.
7176  //
7177  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7178  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7179  system_active, pool_active, hot_team_active ) );
7180 
7181  if ( system_active < 0 ) {
7182  //
7183  // There was an error reading the necessary info from /proc,
7184  // so use the thread limit algorithm instead. Once we set
7185  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7186  // we shouldn't wind up getting back here.
7187  //
7188  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7189  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7190 
7191  //
7192  // Make this call behave like the thread limit algorithm.
7193  //
7194  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7195  : root->r.r_hot_team->t.t_nproc);
7196  if ( retval > set_nproc ) {
7197  retval = set_nproc;
7198  }
7199  if ( retval < KMP_MIN_NTH ) {
7200  retval = KMP_MIN_NTH;
7201  }
7202 
7203  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7204  return retval;
7205  }
7206 
7207  //
7208  // There is a slight delay in the load balance algorithm in detecting
7209  // new running procs. The real system load at this instant should be
7210  // at least as large as the number of active OpenMP threads that are
7211  // available to add to the team.
7212  //
7213  if ( system_active < team_curr_active ) {
7214  system_active = team_curr_active;
7215  }
7216  retval = __kmp_avail_proc - system_active + team_curr_active;
7217  if ( retval > set_nproc ) {
7218  retval = set_nproc;
7219  }
7220  if ( retval < KMP_MIN_NTH ) {
7221  retval = KMP_MIN_NTH;
7222  }
7223 
7224  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7225  return retval;
7226 } // __kmp_load_balance_nproc()
7227 
7228 #endif /* USE_LOAD_BALANCE */
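
The final formula in __kmp_load_balance_nproc() is: raise the measured system load to at least the count of our own already-active threads, compute avail_proc - system_active + team_curr_active, then clamp to [KMP_MIN_NTH, set_nproc]. A tiny standalone mirror with worked numbers (constants are stand-ins):

#include <stdio.h>

#define MIN_NTH 1  /* stand-in for KMP_MIN_NTH */

static int load_balance_nproc(int avail_proc, int system_active,
                              int team_curr_active, int set_nproc) {
    if (system_active < team_curr_active)
        system_active = team_curr_active;          /* our own threads count as load */
    int retval = avail_proc - system_active + team_curr_active;
    if (retval > set_nproc) retval = set_nproc;    /* never exceed the request */
    if (retval < MIN_NTH)   retval = MIN_NTH;
    return retval;
}

int main(void) {
    /* 16 procs, 20 runnable threads system-wide, 4 of them ours, user asked for 16:
       16 - 20 + 4 = 0 -> clamped up to MIN_NTH. */
    printf("%d\n", load_balance_nproc(16, 20, 4, 16));
    /* Lightly loaded machine: 16 - 5 + 4 = 15, capped at the request of 12. */
    printf("%d\n", load_balance_nproc(16, 5, 4, 12));
    return 0;
}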
7229 
7230 /* ------------------------------------------------------------------------ */
7231 /* ------------------------------------------------------------------------ */
7232 
7233 /* NOTE: this is called with the __kmp_init_lock held */
7234 void
7235 __kmp_cleanup( void )
7236 {
7237  int f;
7238 
7239  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7240 
7241  if (TCR_4(__kmp_init_parallel)) {
7242 #if KMP_HANDLE_SIGNALS
7243  __kmp_remove_signals();
7244 #endif
7245  TCW_4(__kmp_init_parallel, FALSE);
7246  }
7247 
7248  if (TCR_4(__kmp_init_middle)) {
7249 #if KMP_AFFINITY_SUPPORTED
7250  __kmp_affinity_uninitialize();
7251 #endif /* KMP_AFFINITY_SUPPORTED */
7252  __kmp_cleanup_hierarchy();
7253  TCW_4(__kmp_init_middle, FALSE);
7254  }
7255 
7256  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7257 
7258  if (__kmp_init_serial) {
7259  __kmp_runtime_destroy();
7260  __kmp_init_serial = FALSE;
7261  }
7262 
7263  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7264  if ( __kmp_root[ f ] != NULL ) {
7265  __kmp_free( __kmp_root[ f ] );
7266  __kmp_root[ f ] = NULL;
7267  }
7268  }
7269  __kmp_free( __kmp_threads );
7270  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is
7271  // no need to free __kmp_root separately.
7272  __kmp_threads = NULL;
7273  __kmp_root = NULL;
7274  __kmp_threads_capacity = 0;
7275 
7276 #if KMP_USE_DYNAMIC_LOCK
7277  __kmp_cleanup_indirect_user_locks();
7278 #else
7279  __kmp_cleanup_user_locks();
7280 #endif
7281 
7282  #if KMP_AFFINITY_SUPPORTED
7283  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7284  __kmp_cpuinfo_file = NULL;
7285  #endif /* KMP_AFFINITY_SUPPORTED */
7286 
7287  #if KMP_USE_ADAPTIVE_LOCKS
7288  #if KMP_DEBUG_ADAPTIVE_LOCKS
7289  __kmp_print_speculative_stats();
7290  #endif
7291  #endif
7292  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7293  __kmp_nested_nth.nth = NULL;
7294  __kmp_nested_nth.size = 0;
7295  __kmp_nested_nth.used = 0;
7296  KMP_INTERNAL_FREE( __kmp_nested_proc_bind.bind_types );
7297  __kmp_nested_proc_bind.bind_types = NULL;
7298  __kmp_nested_proc_bind.size = 0;
7299  __kmp_nested_proc_bind.used = 0;
7300 
7301  __kmp_i18n_catclose();
7302 
7303 #if KMP_STATS_ENABLED
7304  __kmp_stats_fini();
7305 #endif
7306 
7307  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7308 }
7309 
7310 /* ------------------------------------------------------------------------ */
7311 /* ------------------------------------------------------------------------ */
7312 
7313 int
7314 __kmp_ignore_mppbeg( void )
7315 {
7316  char *env;
7317 
7318  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7319  if (__kmp_str_match_false( env ))
7320  return FALSE;
7321  }
7322  // By default __kmpc_begin() is no-op.
7323  return TRUE;
7324 }
7325 
7326 int
7327 __kmp_ignore_mppend( void )
7328 {
7329  char *env;
7330 
7331  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7332  if (__kmp_str_match_false( env ))
7333  return FALSE;
7334  }
7335  // By default __kmpc_end() is no-op.
7336  return TRUE;
7337 }
7338 
7339 void
7340 __kmp_internal_begin( void )
7341 {
7342  int gtid;
7343  kmp_root_t *root;
7344 
7345  /* this is a very important step as it will register new sibling threads
7346  * and assign these new uber threads a new gtid */
7347  gtid = __kmp_entry_gtid();
7348  root = __kmp_threads[ gtid ]->th.th_root;
7349  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7350 
7351  if( root->r.r_begin ) return;
7352  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7353  if( root->r.r_begin ) {
7354  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7355  return;
7356  }
7357 
7358  root->r.r_begin = TRUE;
7359 
7360  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7361 }
7362 
7363 
7364 /* ------------------------------------------------------------------------ */
7365 /* ------------------------------------------------------------------------ */
7366 
7367 void
7368 __kmp_user_set_library (enum library_type arg)
7369 {
7370  int gtid;
7371  kmp_root_t *root;
7372  kmp_info_t *thread;
7373 
7374  /* first, make sure we are initialized so we can get our gtid */
7375 
7376  gtid = __kmp_entry_gtid();
7377  thread = __kmp_threads[ gtid ];
7378 
7379  root = thread->th.th_root;
7380 
7381  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7382  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7383  KMP_WARNING( SetLibraryIncorrectCall );
7384  return;
7385  }
7386 
7387  switch ( arg ) {
7388  case library_serial :
7389  thread->th.th_set_nproc = 0;
7390  set__nproc( thread, 1 );
7391  break;
7392  case library_turnaround :
7393  thread->th.th_set_nproc = 0;
7394  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7395  break;
7396  case library_throughput :
7397  thread->th.th_set_nproc = 0;
7398  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7399  break;
7400  default:
7401  KMP_FATAL( UnknownLibraryType, arg );
7402  }
7403 
7404  __kmp_aux_set_library ( arg );
7405 }
7406 
7407 void
7408 __kmp_aux_set_stacksize( size_t arg )
7409 {
7410  if (! __kmp_init_serial)
7411  __kmp_serial_initialize();
7412 
7413 #if KMP_OS_DARWIN
7414  if (arg & (0x1000 - 1)) {
7415  arg &= ~(0x1000 - 1);
7416  if(arg + 0x1000) /* check for overflow if we round up */
7417  arg += 0x1000;
7418  }
7419 #endif
7420  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7421 
7422  /* only change the default stacksize before the first parallel region */
7423  if (! TCR_4(__kmp_init_parallel)) {
7424  size_t value = arg; /* argument is in bytes */
7425 
7426  if (value < __kmp_sys_min_stksize )
7427  value = __kmp_sys_min_stksize ;
7428  else if (value > KMP_MAX_STKSIZE)
7429  value = KMP_MAX_STKSIZE;
7430 
7431  __kmp_stksize = value;
7432 
7433  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7434  }
7435 
7436  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7437 }
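
On Darwin, __kmp_aux_set_stacksize() first rounds the requested size up to the next 0x1000 (4 KiB) boundary before clamping it between the system minimum and KMP_MAX_STKSIZE. A sketch of just the rounding step (the overflow guard from the original is omitted for brevity):

#include <stdio.h>
#include <stddef.h>

/* Round a byte count up to the next 0x1000 (4 KiB) boundary, as the
   KMP_OS_DARWIN branch above does before clamping the value. */
static size_t round_up_4k(size_t bytes) {
    const size_t page = 0x1000;
    if (bytes & (page - 1))
        bytes = (bytes & ~(page - 1)) + page;
    return bytes;
}

int main(void) {
    printf("%zu\n", round_up_4k(4096));  /* already aligned -> 4096 */
    printf("%zu\n", round_up_4k(5000));  /* rounded up      -> 8192 */
    return 0;
}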
7438 
7439 /* set the behaviour of the runtime library */
7440 /* TODO this can cause some odd behaviour with sibling parallelism... */
7441 void
7442 __kmp_aux_set_library (enum library_type arg)
7443 {
7444  __kmp_library = arg;
7445 
7446  switch ( __kmp_library ) {
7447  case library_serial :
7448  {
7449  KMP_INFORM( LibraryIsSerial );
7450  (void) __kmp_change_library( TRUE );
7451  }
7452  break;
7453  case library_turnaround :
7454  (void) __kmp_change_library( TRUE );
7455  break;
7456  case library_throughput :
7457  (void) __kmp_change_library( FALSE );
7458  break;
7459  default:
7460  KMP_FATAL( UnknownLibraryType, arg );
7461  }
7462 }
7463 
7464 /* ------------------------------------------------------------------------ */
7465 /* ------------------------------------------------------------------------ */
7466 
7467 void
7468 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7469 {
7470  int blocktime = arg; /* argument is in milliseconds */
7471 #if KMP_USE_MONITOR
7472  int bt_intervals;
7473 #endif
7474  int bt_set;
7475 
7476  __kmp_save_internal_controls( thread );
7477 
7478  /* Normalize and set blocktime for the teams */
7479  if (blocktime < KMP_MIN_BLOCKTIME)
7480  blocktime = KMP_MIN_BLOCKTIME;
7481  else if (blocktime > KMP_MAX_BLOCKTIME)
7482  blocktime = KMP_MAX_BLOCKTIME;
7483 
7484  set__blocktime_team( thread->th.th_team, tid, blocktime );
7485  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7486 
7487 #if KMP_USE_MONITOR
7488  /* Calculate and set blocktime intervals for the teams */
7489  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7490 
7491  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7492  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7493 #endif
7494 
7495  /* Record that the blocktime has been explicitly set */
7496  bt_set = TRUE;
7497 
7498  set__bt_set_team( thread->th.th_team, tid, bt_set );
7499  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7500 #if KMP_USE_MONITOR
7501  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7502  "bt_intervals=%d, monitor_updates=%d\n",
7503  __kmp_gtid_from_tid(tid, thread->th.th_team),
7504  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7505  __kmp_monitor_wakeups));
7506 #else
7507  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7508  __kmp_gtid_from_tid(tid, thread->th.th_team),
7509  thread->th.th_team->t.t_id, tid, blocktime));
7510 #endif
7511 }
7512 
7513 void
7514 __kmp_aux_set_defaults(
7515  char const * str,
7516  int len
7517 ) {
7518  if ( ! __kmp_init_serial ) {
7519  __kmp_serial_initialize();
7520  };
7521  __kmp_env_initialize( str );
7522 
7523  if (__kmp_settings
7524 #if OMP_40_ENABLED
7525  || __kmp_display_env || __kmp_display_env_verbose
7526 #endif // OMP_40_ENABLED
7527  ) {
7528  __kmp_env_print();
7529  }
7530 } // __kmp_aux_set_defaults
7531 
7532 /* ------------------------------------------------------------------------ */
7533 
7534 /*
7535  * internal fast reduction routines
7536  */
7537 
7538 PACKED_REDUCTION_METHOD_T
7539 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7540  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7541  kmp_critical_name *lck )
7542 {
7543 
7544  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7545  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7546  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7547  // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7548 
7549  PACKED_REDUCTION_METHOD_T retval;
7550 
7551  int team_size;
7552 
7553  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7554  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7555 
7556  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7557  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7558 
7559  retval = critical_reduce_block;
7560 
7561  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7562 
7563  if( team_size == 1 ) {
7564 
7565  retval = empty_reduce_block;
7566 
7567  } else {
7568 
7569  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7570  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7571 
7572  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7573 
7574  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7575 
7576  int teamsize_cutoff = 4;
7577 
7578 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7579  if( __kmp_mic_type != non_mic ) {
7580  teamsize_cutoff = 8;
7581  }
7582 #endif
7583  if( tree_available ) {
7584  if( team_size <= teamsize_cutoff ) {
7585  if ( atomic_available ) {
7586  retval = atomic_reduce_block;
7587  }
7588  } else {
7589  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7590  }
7591  } else if ( atomic_available ) {
7592  retval = atomic_reduce_block;
7593  }
7594  #else
7595  #error "Unknown or unsupported OS"
7596  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7597 
7598  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7599 
7600  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7601 
7602  // basic tuning
7603 
7604  if( atomic_available ) {
7605  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7606  retval = atomic_reduce_block;
7607  }
7608  } // otherwise: use critical section
7609 
7610  #elif KMP_OS_DARWIN
7611 
7612  if( atomic_available && ( num_vars <= 3 ) ) {
7613  retval = atomic_reduce_block;
7614  } else if( tree_available ) {
7615  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7616  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7617  }
7618  } // otherwise: use critical section
7619 
7620  #else
7621  #error "Unknown or unsupported OS"
7622  #endif
7623 
7624  #else
7625  #error "Unknown or unsupported architecture"
7626  #endif
7627 
7628  }
7629 
7630  // KMP_FORCE_REDUCTION
7631 
7632  // If the team is serialized (team_size == 1), ignore the forced reduction
7633  // method and stay with the unsynchronized method (empty_reduce_block)
7634  if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7635 
7636  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7637 
7638  int atomic_available, tree_available;
7639 
7640  switch( ( forced_retval = __kmp_force_reduction_method ) )
7641  {
7642  case critical_reduce_block:
7643  KMP_ASSERT( lck ); // lck should be != 0
7644  break;
7645 
7646  case atomic_reduce_block:
7647  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7648  if( ! atomic_available ) {
7649  KMP_WARNING(RedMethodNotSupported, "atomic");
7650  forced_retval = critical_reduce_block;
7651  }
7652  break;
7653 
7654  case tree_reduce_block:
7655  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7656  if( ! tree_available ) {
7657  KMP_WARNING(RedMethodNotSupported, "tree");
7658  forced_retval = critical_reduce_block;
7659  } else {
7660  #if KMP_FAST_REDUCTION_BARRIER
7661  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7662  #endif
7663  }
7664  break;
7665 
7666  default:
7667  KMP_ASSERT( 0 ); // "unsupported method specified"
7668  }
7669 
7670  retval = forced_retval;
7671  }
7672 
7673  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7674 
7675  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7676  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7677 
7678  return ( retval );
7679 }
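
For the x86_64 branch above, the selection logic reduces to: a team of one gets the empty (unsynchronized) block; if tree reduction is generated, small teams (at or below the cutoff of 4, or 8 on MIC) still prefer the atomic block when available, larger teams take the tree; otherwise atomic if available, else the critical-section default. A distilled standalone sketch of that decision (hypothetical enum and helper; the real routine also packs barrier-kind bits into the returned value):

#include <stdio.h>

enum method { CRITICAL, EMPTY, ATOMIC, TREE };

static enum method pick_method(int team_size, int atomic_ok, int tree_ok, int on_mic) {
    int cutoff = on_mic ? 8 : 4;
    if (team_size == 1) return EMPTY;
    if (tree_ok) {
        if (team_size <= cutoff)
            return atomic_ok ? ATOMIC : CRITICAL;
        return TREE;
    }
    return atomic_ok ? ATOMIC : CRITICAL;
}

int main(void) {
    printf("%d\n", pick_method(1, 1, 1, 0));   /* EMPTY  */
    printf("%d\n", pick_method(4, 1, 1, 0));   /* ATOMIC */
    printf("%d\n", pick_method(16, 1, 1, 0));  /* TREE   */
    printf("%d\n", pick_method(16, 1, 0, 0));  /* ATOMIC */
    return 0;
}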
7680 
7681 // this function is for testing set/get/determine reduce method
7682 kmp_int32
7683 __kmp_get_reduce_method( void ) {
7684  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7685 }
7686 
7687 /* ------------------------------------------------------------------------ */