LLVM OpenMP* Runtime Library
kmp_csupport.cpp
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_error.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 /* ------------------------------------------------------------------------ */
32 /* ------------------------------------------------------------------------ */
33 
34 /* flags will be used in future, e.g., to implement */
35 /* openmp_strict library restrictions */
36 
46 void
47 __kmpc_begin(ident_t *loc, kmp_int32 flags)
48 {
49  // By default __kmp_ignore_mppbeg() returns TRUE.
50  if (__kmp_ignore_mppbeg() == FALSE) {
51  __kmp_internal_begin();
52 
53  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
54  }
55 }
56 
64 void
65 __kmpc_end(ident_t *loc)
66 {
67  // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
68  // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
69  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
70  // will unregister this root (which can cause library shutdown).
71  if (__kmp_ignore_mppend() == FALSE) {
72  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
73  KA_TRACE( 30, ("__kmpc_end\n" ));
74 
75  __kmp_internal_end_thread( -1 );
76  }
77 }
78 
98 kmp_int32
99 __kmpc_global_thread_num(ident_t *loc)
100 {
101  kmp_int32 gtid = __kmp_entry_gtid();
102 
103  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
104 
105  return gtid;
106 }
107 
121 kmp_int32
122 __kmpc_global_num_threads(ident_t *loc)
123 {
124  KC_TRACE(10,("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
125 
126  return TCR_4(__kmp_all_nth);
127 }
128 
135 kmp_int32
136 __kmpc_bound_thread_num(ident_t *loc)
137 {
138  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
139  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
140 }
141 
147 kmp_int32
148 __kmpc_bound_num_threads(ident_t *loc)
149 {
150  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
151 
152  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
153 }
154 
161 kmp_int32
162 __kmpc_ok_to_fork(ident_t *loc)
163 {
164 #ifndef KMP_DEBUG
165 
166  return TRUE;
167 
168 #else
169 
170  const char *semi2;
171  const char *semi3;
172  int line_no;
173 
174  if (__kmp_par_range == 0) {
175  return TRUE;
176  }
177  semi2 = loc->psource;
178  if (semi2 == NULL) {
179  return TRUE;
180  }
181  semi2 = strchr(semi2, ';');
182  if (semi2 == NULL) {
183  return TRUE;
184  }
185  semi2 = strchr(semi2 + 1, ';');
186  if (semi2 == NULL) {
187  return TRUE;
188  }
189  if (__kmp_par_range_filename[0]) {
190  const char *name = semi2 - 1;
191  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
192  name--;
193  }
194  if ((*name == '/') || (*name == ';')) {
195  name++;
196  }
197  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
198  return __kmp_par_range < 0;
199  }
200  }
201  semi3 = strchr(semi2 + 1, ';');
202  if (__kmp_par_range_routine[0]) {
203  if ((semi3 != NULL) && (semi3 > semi2)
204  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
205  return __kmp_par_range < 0;
206  }
207  }
208  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
209  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
210  return __kmp_par_range > 0;
211  }
212  return __kmp_par_range < 0;
213  }
214  return TRUE;
215 
216 #endif /* KMP_DEBUG */
217 
218 }
219 
225 kmp_int32
226 __kmpc_in_parallel(ident_t *loc)
227 {
228  return __kmp_entry_thread() -> th.th_root -> r.r_active;
229 }
230 
240 void
241 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
242 {
243  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
244  global_tid, num_threads ) );
245 
246  __kmp_push_num_threads( loc, global_tid, num_threads );
247 }
248 
249 void
250 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
251 {
252  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
253 
254  /* the num_threads are automatically popped */
255 }
256 
257 
258 #if OMP_40_ENABLED
259 
260 void
261 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
262 {
263  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
264  global_tid, proc_bind ) );
265 
266  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
267 }
268 
269 #endif /* OMP_40_ENABLED */
270 
271 
281 void
282 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
283 {
284  int gtid = __kmp_entry_gtid();
285 
286 #if (KMP_STATS_ENABLED)
287  int inParallel = __kmpc_in_parallel(loc);
288  if (inParallel)
289  {
290  KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
291  }
292  else
293  {
294  KMP_COUNT_BLOCK(OMP_PARALLEL);
295  }
296 #endif
297 
298  // maybe saving thr_state is enough here
299  {
300  va_list ap;
301  va_start( ap, microtask );
302 
303 #if OMPT_SUPPORT
304  ompt_frame_t* ompt_frame;
305  if (ompt_enabled) {
306  kmp_info_t *master_th = __kmp_threads[ gtid ];
307  kmp_team_t *parent_team = master_th->th.th_team;
308  ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
309  if (lwt)
310  ompt_frame = &(lwt->ompt_task_info.frame);
311  else
312  {
313  int tid = __kmp_tid_from_gtid( gtid );
314  ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid].
315  ompt_task_info.frame);
316  }
317  ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
318  }
319 #endif
320 
321 #if INCLUDE_SSC_MARKS
322  SSC_MARK_FORKING();
323 #endif
324  __kmp_fork_call( loc, gtid, fork_context_intel,
325  argc,
326 #if OMPT_SUPPORT
327  VOLATILE_CAST(void *) microtask, // "unwrapped" task
328 #endif
329  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
330  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
331 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
332 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
333  &ap
334 #else
335  ap
336 #endif
337  );
338 #if INCLUDE_SSC_MARKS
339  SSC_MARK_JOINING();
340 #endif
341  __kmp_join_call( loc, gtid
342 #if OMPT_SUPPORT
343  , fork_context_intel
344 #endif
345  );
346 
347  va_end( ap );
348 
349  }
350 }
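/*
 * Editor's illustration (not part of the original source): a compiler front end
 * typically lowers "#pragma omp parallel" onto __kmpc_fork_call by outlining the
 * region body into a microtask whose first two parameters receive the global and
 * bound thread ids, with the shared variables appended as pointer arguments.
 * The names "outlined", "run_parallel" and "loc_example" are hypothetical; only
 * __kmpc_fork_call and kmpc_micro come from this runtime. A minimal sketch:
 *
 *   static void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *n) {
 *       // parallel region body; *n is a shared variable
 *       if (*bound_tid == 0)
 *           *n += 1;                          // only the team master writes *n
 *   }
 *
 *   void run_parallel(ident_t *loc_example) {
 *       int n = 0;
 *       // argc == 1: one shared argument follows the microtask pointer
 *       __kmpc_fork_call(loc_example, 1, (kmpc_micro)outlined, &n);
 *   }
 */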
351 
352 #if OMP_40_ENABLED
353 
364 void
365 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
366 {
367  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
368  global_tid, num_teams, num_threads ) );
369 
370  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
371 }
372 
382 void
383 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
384 {
385  int gtid = __kmp_entry_gtid();
386  kmp_info_t *this_thr = __kmp_threads[ gtid ];
387  va_list ap;
388  va_start( ap, microtask );
389 
390  KMP_COUNT_BLOCK(OMP_TEAMS);
391 
392  // remember teams entry point and nesting level
393  this_thr->th.th_teams_microtask = microtask;
394  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
395 
396 #if OMPT_SUPPORT
397  kmp_team_t *parent_team = this_thr->th.th_team;
398  int tid = __kmp_tid_from_gtid( gtid );
399  if (ompt_enabled) {
400  parent_team->t.t_implicit_task_taskdata[tid].
401  ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
402  }
403 #endif
404 
405  // if __kmpc_push_num_teams was not called, set the default number of teams
406  if ( this_thr->th.th_teams_size.nteams == 0 ) {
407  __kmp_push_num_teams( loc, gtid, 0, 0 );
408  }
409  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
410  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
411  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
412 
413  __kmp_fork_call( loc, gtid, fork_context_intel,
414  argc,
415 #if OMPT_SUPPORT
416  VOLATILE_CAST(void *) microtask, // "unwrapped" task
417 #endif
418  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
419  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
420 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
421  &ap
422 #else
423  ap
424 #endif
425  );
426  __kmp_join_call( loc, gtid
427 #if OMPT_SUPPORT
428  , fork_context_intel
429 #endif
430  );
431 
432  this_thr->th.th_teams_microtask = NULL;
433  this_thr->th.th_teams_level = 0;
434  *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
435  va_end( ap );
436 }
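/*
 * Editor's illustration (not part of the original source): "#pragma omp teams
 * num_teams(4) thread_limit(8)" is typically lowered to a __kmpc_push_num_teams
 * call followed by __kmpc_fork_teams with an outlined teams region, mirroring
 * the parallel lowering above. Names below are hypothetical:
 *
 *   static void teams_outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *n) {
 *       // teams region body
 *   }
 *
 *   void run_teams(ident_t *loc_example) {
 *       int n = 0;
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       __kmpc_push_num_teams(loc_example, gtid, 4, 8);   // num_teams = 4, thread_limit = 8
 *       __kmpc_fork_teams(loc_example, 1, (kmpc_micro)teams_outlined, &n);
 *   }
 */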
437 #endif /* OMP_40_ENABLED */
438 
439 
440 //
441 // I don't think this function should ever have been exported.
442 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
443 // openmp code ever called it, but it's been exported from the RTL for so
444 // long that I'm afraid to remove the definition.
445 //
446 int
447 __kmpc_invoke_task_func( int gtid )
448 {
449  return __kmp_invoke_task_func( gtid );
450 }
451 
464 void
465 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
466 {
467  // The implementation is now in kmp_runtime.cpp so that it can share static
468  // functions with kmp_fork_call since the tasks to be done are similar in
469  // each case.
470  __kmp_serialized_parallel(loc, global_tid);
471 }
472 
480 void
481 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
482 {
483  kmp_internal_control_t *top;
484  kmp_info_t *this_thr;
485  kmp_team_t *serial_team;
486 
487  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
488 
489  /* skip all this code for autopar serialized loops since it results in
490  unacceptable overhead */
491  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
492  return;
493 
494  // Not autopar code
495  if( ! TCR_4( __kmp_init_parallel ) )
496  __kmp_parallel_initialize();
497 
498  this_thr = __kmp_threads[ global_tid ];
499  serial_team = this_thr->th.th_serial_team;
500 
501  #if OMP_45_ENABLED
502  kmp_task_team_t * task_team = this_thr->th.th_task_team;
503 
504  // we need to wait for the proxy tasks before finishing the thread
505  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
506  __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
507  #endif
508 
509  KMP_MB();
510  KMP_DEBUG_ASSERT( serial_team );
511  KMP_ASSERT( serial_team -> t.t_serialized );
512  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
513  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
514  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
515  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
516 
517  /* If necessary, pop the internal control stack values and replace the team values */
518  top = serial_team -> t.t_control_stack_top;
519  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
520  copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
521  serial_team -> t.t_control_stack_top = top -> next;
522  __kmp_free(top);
523  }
524 
525  //if( serial_team -> t.t_serialized > 1 )
526  serial_team -> t.t_level--;
527 
528  /* pop dispatch buffers stack */
529  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
530  {
531  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
532  serial_team->t.t_dispatch->th_disp_buffer =
533  serial_team->t.t_dispatch->th_disp_buffer->next;
534  __kmp_free( disp_buffer );
535  }
536 
537  -- serial_team -> t.t_serialized;
538  if ( serial_team -> t.t_serialized == 0 ) {
539 
540  /* return to the parallel section */
541 
542 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
543  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
544  __kmp_clear_x87_fpu_status_word();
545  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
546  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
547  }
548 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
549 
550  this_thr -> th.th_team = serial_team -> t.t_parent;
551  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
552 
553  /* restore values cached in the thread */
554  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
555  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
556  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
557 
558  /* TODO the below shouldn't need to be adjusted for serialized teams */
559  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
560  t.t_dispatch[ serial_team -> t.t_master_tid ];
561 
562  __kmp_pop_current_task_from_thread( this_thr );
563 
564  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
565  this_thr -> th.th_current_task -> td_flags.executing = 1;
566 
567  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
568  // Copy the task team from the new child / old parent team to the thread.
569  this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
570  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
571  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
572  }
573  } else {
574  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
575  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
576  global_tid, serial_team, serial_team -> t.t_serialized ) );
577  }
578  }
579 
580  if ( __kmp_env_consistency_check )
581  __kmp_pop_parallel( global_tid, NULL );
582 }
583 
592 void
593 __kmpc_flush(ident_t *loc)
594 {
595  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
596 
597  /* need explicit __mf() here since use volatile instead in library */
598  KMP_MB(); /* Flush all pending memory write invalidates. */
599 
600  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
601  #if KMP_MIC
602  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
603  // We shouldn't need it, though, since the ABI rules require that
604  // * If the compiler generates NGO stores it also generates the fence
605  // * If users hand-code NGO stores they should insert the fence
606  // therefore no incomplete unordered stores should be visible.
607  #else
608  // C74404
609  // This is to address non-temporal store instructions (sfence needed).
610  // The clflush instruction is also addressed (mfence needed).
611  // Probably the non-temporal load movntdqa instruction should also be addressed.
612  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
613  if ( ! __kmp_cpuinfo.initialized ) {
614  __kmp_query_cpuid( & __kmp_cpuinfo );
615  }; // if
616  if ( ! __kmp_cpuinfo.sse2 ) {
617  // CPU cannot execute SSE2 instructions.
618  } else {
619  #if KMP_COMPILER_ICC
620  _mm_mfence();
621  #elif KMP_COMPILER_MSVC
622  MemoryBarrier();
623  #else
624  __sync_synchronize();
625  #endif // KMP_COMPILER_ICC
626  }; // if
627  #endif // KMP_MIC
628  #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
629  // Nothing to see here; move along.
630  #elif KMP_ARCH_PPC64
631  // Nothing needed here (we have a real MB above).
632  #if KMP_OS_CNK
633  // The flushing thread needs to yield here; this prevents a
634  // busy-waiting thread from saturating the pipeline. flush is
635  // often used in loops like this:
636  // while (!flag) {
637  // #pragma omp flush(flag)
638  // }
639  // and adding the yield here is good for at least a 10x speedup
640  // when running >2 threads per core (on the NAS LU benchmark).
641  __kmp_yield(TRUE);
642  #endif
643  #else
644  #error Unknown or unsupported architecture
645  #endif
646 
647 }
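/*
 * Editor's illustration (not part of the original source): at the runtime
 * interface the flush directive carries no variable list; a compiler simply
 * emits a call to __kmpc_flush with the source location. A hedged sketch of the
 * spin-wait pattern mentioned in the PPC64 comment above (names hypothetical):
 *
 *   void spin_until_set(ident_t *loc_example, volatile int *flag) {
 *       while (*flag == 0) {
 *           __kmpc_flush(loc_example);        // memory fence, as implemented above
 *       }
 *   }
 */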
648 
649 /* -------------------------------------------------------------------------- */
650 
651 /* -------------------------------------------------------------------------- */
652 
660 void
661 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
662 {
663  KMP_COUNT_BLOCK(OMP_BARRIER);
664  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
665 
666  if (! TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668 
669  if ( __kmp_env_consistency_check ) {
670  if ( loc == 0 ) {
671  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
672  }; // if
673 
674  __kmp_check_barrier( global_tid, ct_barrier, loc );
675  }
676 
677 #if OMPT_SUPPORT && OMPT_TRACE
678  ompt_frame_t * ompt_frame;
679  if (ompt_enabled ) {
680  ompt_frame = __ompt_get_task_frame_internal(0);
681  if ( ompt_frame->reenter_runtime_frame == NULL )
682  ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
683  }
684 #endif
685  __kmp_threads[ global_tid ]->th.th_ident = loc;
686  // TODO: explicit barrier_wait_id:
687  // this function is called when a 'barrier' directive is present or for the
688  // implicit barrier at the end of a worksharing construct.
689  // 1) better to add a per-thread barrier counter to a thread data structure
690  // 2) set to 0 when a new team is created
691  // 3) no sync is required
692 
693  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
694 #if OMPT_SUPPORT && OMPT_TRACE
695  if (ompt_enabled ) {
696  ompt_frame->reenter_runtime_frame = NULL;
697  }
698 #endif
699 }
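/*
 * Editor's illustration (not part of the original source): an explicit
 * "#pragma omp barrier" is typically lowered to a single call of this entry
 * point with the caller's global thread id (names hypothetical):
 *
 *   void explicit_barrier(ident_t *loc_example) {
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       __kmpc_barrier(loc_example, gtid);
 *   }
 */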
700 
701 /* The BARRIER for a MASTER section is always explicit */
708 kmp_int32
709 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
710 {
711  int status = 0;
712 
713  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
714 
715  if( ! TCR_4( __kmp_init_parallel ) )
716  __kmp_parallel_initialize();
717 
718  if( KMP_MASTER_GTID( global_tid )) {
719  KMP_COUNT_BLOCK(OMP_MASTER);
720  KMP_PUSH_PARTITIONED_TIMER(OMP_master);
721  status = 1;
722  }
723 
724 #if OMPT_SUPPORT && OMPT_TRACE
725  if (status) {
726  if (ompt_enabled &&
727  ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
728  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
729  kmp_team_t *team = this_thr -> th.th_team;
730 
731  int tid = __kmp_tid_from_gtid( global_tid );
732  ompt_callbacks.ompt_callback(ompt_event_master_begin)(
733  team->t.ompt_team_info.parallel_id,
734  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
735  }
736  }
737 #endif
738 
739  if ( __kmp_env_consistency_check ) {
740 #if KMP_USE_DYNAMIC_LOCK
741  if (status)
742  __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
743  else
744  __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
745 #else
746  if (status)
747  __kmp_push_sync( global_tid, ct_master, loc, NULL );
748  else
749  __kmp_check_sync( global_tid, ct_master, loc, NULL );
750 #endif
751  }
752 
753  return status;
754 }
755 
764 void
765 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
766 {
767  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
768 
769  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
770  KMP_POP_PARTITIONED_TIMER();
771 
772 #if OMPT_SUPPORT && OMPT_TRACE
773  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
774  kmp_team_t *team = this_thr -> th.th_team;
775  if (ompt_enabled &&
776  ompt_callbacks.ompt_callback(ompt_event_master_end)) {
777  int tid = __kmp_tid_from_gtid( global_tid );
778  ompt_callbacks.ompt_callback(ompt_event_master_end)(
779  team->t.ompt_team_info.parallel_id,
780  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
781  }
782 #endif
783 
784  if ( __kmp_env_consistency_check ) {
785  if( global_tid < 0 )
786  KMP_WARNING( ThreadIdentInvalid );
787 
788  if( KMP_MASTER_GTID( global_tid ))
789  __kmp_pop_sync( global_tid, ct_master, loc );
790  }
791 }
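/*
 * Editor's illustration (not part of the original source): "#pragma omp master"
 * is typically lowered to a guarded region; only the thread for which
 * __kmpc_master returns 1 executes the body and must then call
 * __kmpc_end_master. No barrier is implied. Names below are hypothetical:
 *
 *   void master_only(ident_t *loc_example, int *counter) {
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       if (__kmpc_master(loc_example, gtid)) {
 *           (*counter)++;                     // master-only work
 *           __kmpc_end_master(loc_example, gtid);
 *       }
 *   }
 */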
792 
800 void
801 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
802 {
803  int cid = 0;
804  kmp_info_t *th;
805  KMP_DEBUG_ASSERT( __kmp_init_serial );
806 
807  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
808 
809  if (! TCR_4(__kmp_init_parallel))
810  __kmp_parallel_initialize();
811 
812 #if USE_ITT_BUILD
813  __kmp_itt_ordered_prep( gtid );
814  // TODO: ordered_wait_id
815 #endif /* USE_ITT_BUILD */
816 
817  th = __kmp_threads[ gtid ];
818 
819 #if OMPT_SUPPORT && OMPT_TRACE
820  if (ompt_enabled) {
821  /* OMPT state update */
822  th->th.ompt_thread_info.wait_id = (uint64_t) loc;
823  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
824 
825  /* OMPT event callback */
826  if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
827  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
828  th->th.ompt_thread_info.wait_id);
829  }
830  }
831 #endif
832 
833  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
834  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
835  else
836  __kmp_parallel_deo( & gtid, & cid, loc );
837 
838 #if OMPT_SUPPORT && OMPT_TRACE
839  if (ompt_enabled) {
840  /* OMPT state update */
841  th->th.ompt_thread_info.state = ompt_state_work_parallel;
842  th->th.ompt_thread_info.wait_id = 0;
843 
844  /* OMPT event callback */
845  if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
846  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
847  th->th.ompt_thread_info.wait_id);
848  }
849  }
850 #endif
851 
852 #if USE_ITT_BUILD
853  __kmp_itt_ordered_start( gtid );
854 #endif /* USE_ITT_BUILD */
855 }
856 
864 void
865 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
866 {
867  int cid = 0;
868  kmp_info_t *th;
869 
870  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
871 
872 #if USE_ITT_BUILD
873  __kmp_itt_ordered_end( gtid );
874  // TODO: ordered_wait_id
875 #endif /* USE_ITT_BUILD */
876 
877  th = __kmp_threads[ gtid ];
878 
879  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
880  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
881  else
882  __kmp_parallel_dxo( & gtid, & cid, loc );
883 
884 #if OMPT_SUPPORT && OMPT_BLAME
885  if (ompt_enabled &&
886  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
887  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
888  th->th.ompt_thread_info.wait_id);
889  }
890 #endif
891 }
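/*
 * Editor's illustration (not part of the original source): inside a loop with
 * the "ordered" clause, each iteration brackets its ordered block with
 * __kmpc_ordered / __kmpc_end_ordered so the blocks are released in iteration
 * order. Names below are hypothetical:
 *
 *   void ordered_block(ident_t *loc_example, kmp_int32 gtid, int i, int *sum) {
 *       __kmpc_ordered(loc_example, gtid);
 *       *sum += i;                            // executes in iteration order
 *       __kmpc_end_ordered(loc_example, gtid);
 *   }
 */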
892 
893 #if KMP_USE_DYNAMIC_LOCK
894 
895 static __forceinline void
896 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag)
897 {
898  // Pointer to the allocated indirect lock is written to crit, while indexing is ignored.
899  void *idx;
900  kmp_indirect_lock_t **lck;
901  lck = (kmp_indirect_lock_t **)crit;
902  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
903  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
904  KMP_SET_I_LOCK_LOCATION(ilk, loc);
905  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
906  KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
907 #if USE_ITT_BUILD
908  __kmp_itt_critical_creating(ilk->lock, loc);
909 #endif
910  int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
911  if (status == 0) {
912 #if USE_ITT_BUILD
913  __kmp_itt_critical_destroyed(ilk->lock);
914 #endif
915  // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit.
916  //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
917  }
918  KMP_DEBUG_ASSERT(*lck != NULL);
919 }
920 
921 // Fast-path acquire tas lock
922 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) { \
923  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
924  if (l->lk.poll != KMP_LOCK_FREE(tas) || \
925  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
926  kmp_uint32 spins; \
927  KMP_FSYNC_PREPARE(l); \
928  KMP_INIT_YIELD(spins); \
929  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
930  KMP_YIELD(TRUE); \
931  } else { \
932  KMP_YIELD_SPIN(spins); \
933  } \
934  kmp_backoff_t backoff = __kmp_spin_backoff_params; \
935  while (l->lk.poll != KMP_LOCK_FREE(tas) || \
936  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
937  __kmp_spin_backoff(&backoff); \
938  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
939  KMP_YIELD(TRUE); \
940  } else { \
941  KMP_YIELD_SPIN(spins); \
942  } \
943  } \
944  } \
945  KMP_FSYNC_ACQUIRED(l); \
946 }
947 
948 // Fast-path test tas lock
949 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) { \
950  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
951  rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
952  KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas)); \
953 }
954 
955 // Fast-path release tas lock
956 #define KMP_RELEASE_TAS_LOCK(lock, gtid) { \
957  TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
958  KMP_MB(); \
959 }
960 
961 #if KMP_USE_FUTEX
962 
963 # include <unistd.h>
964 # include <sys/syscall.h>
965 # ifndef FUTEX_WAIT
966 # define FUTEX_WAIT 0
967 # endif
968 # ifndef FUTEX_WAKE
969 # define FUTEX_WAKE 1
970 # endif
971 
972 // Fast-path acquire futex lock
973 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) { \
974  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
975  kmp_int32 gtid_code = (gtid+1) << 1; \
976  KMP_MB(); \
977  KMP_FSYNC_PREPARE(ftx); \
978  kmp_int32 poll_val; \
979  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
980  KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
981  kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
982  if (!cond) { \
983  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) { \
984  continue; \
985  } \
986  poll_val |= KMP_LOCK_BUSY(1, futex); \
987  } \
988  kmp_int32 rc; \
989  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \
990  continue; \
991  } \
992  gtid_code |= 1; \
993  } \
994  KMP_FSYNC_ACQUIRED(ftx); \
995 }
996 
997 // Fast-path test futex lock
998 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) { \
999  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1000  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) { \
1001  KMP_FSYNC_ACQUIRED(ftx); \
1002  rc = TRUE; \
1003  } else { \
1004  rc = FALSE; \
1005  } \
1006 }
1007 
1008 // Fast-path release futex lock
1009 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) { \
1010  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1011  KMP_MB(); \
1012  KMP_FSYNC_RELEASING(ftx); \
1013  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1014  if (KMP_LOCK_STRIP(poll_val) & 1) { \
1015  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1016  } \
1017  KMP_MB(); \
1018  KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1019 }
1020 
1021 #endif // KMP_USE_FUTEX
1022 
1023 #else // KMP_USE_DYNAMIC_LOCK
1024 
1025 static kmp_user_lock_p
1026 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1027 {
1028  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1029 
1030  //
1031  // Because of the double-check, the following load
1032  // doesn't need to be volatile.
1033  //
1034  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1035 
1036  if ( lck == NULL ) {
1037  void * idx;
1038 
1039  // Allocate & initialize the lock.
1040  // Remember allocated locks in table in order to free them in __kmp_cleanup()
1041  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1042  __kmp_init_user_lock_with_checks( lck );
1043  __kmp_set_user_lock_location( lck, loc );
1044 #if USE_ITT_BUILD
1045  __kmp_itt_critical_creating( lck );
1046  // __kmp_itt_critical_creating() should be called *before* the first use of the underlying
1047  // lock. This is the only place where we can guarantee it. The lock may be destroyed without
1048  // ever being used, which is not a problem: this is not a real event seen by the user but
1049  // rather sets a name for the object (lock). See kmp_itt.h for more details.
1050 #endif /* USE_ITT_BUILD */
1051 
1052  //
1053  // Use a cmpxchg instruction to slam the start of the critical
1054  // section with the lock pointer. If another thread beat us
1055  // to it, deallocate the lock, and use the lock that the other
1056  // thread allocated.
1057  //
1058  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1059 
1060  if ( status == 0 ) {
1061  // Deallocate the lock and reload the value.
1062 #if USE_ITT_BUILD
1063  __kmp_itt_critical_destroyed( lck );
1064  // Let ITT know the lock is destroyed and the same memory location may be reused for
1065  // another purpose.
1066 #endif /* USE_ITT_BUILD */
1067  __kmp_destroy_user_lock_with_checks( lck );
1068  __kmp_user_lock_free( &idx, gtid, lck );
1069  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1070  KMP_DEBUG_ASSERT( lck != NULL );
1071  }
1072  }
1073  return lck;
1074 }
1075 
1076 #endif // KMP_USE_DYNAMIC_LOCK
1077 
1088 void
1089 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
1090 {
1091 #if KMP_USE_DYNAMIC_LOCK
1092  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1093 #else
1094  KMP_COUNT_BLOCK(OMP_CRITICAL);
1095  KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
1096  kmp_user_lock_p lck;
1097 
1098  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1099 
1100  //TODO: add THR_OVHD_STATE
1101 
1102  KMP_CHECK_USER_LOCK_INIT();
1103 
1104  if ( ( __kmp_user_lock_kind == lk_tas )
1105  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1106  lck = (kmp_user_lock_p)crit;
1107  }
1108 #if KMP_USE_FUTEX
1109  else if ( ( __kmp_user_lock_kind == lk_futex )
1110  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1111  lck = (kmp_user_lock_p)crit;
1112  }
1113 #endif
1114  else { // ticket, queuing or drdpa
1115  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1116  }
1117 
1118  if ( __kmp_env_consistency_check )
1119  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1120 
1121  /* since the critical directive binds to all threads, not just
1122   * the current team, we have to check this even if we are in a
1123   * serialized team */
1124  /* also, even if we are the uber thread, we still have to acquire the lock,
1125   * as we have to contend with sibling threads */
1126 
1127 #if USE_ITT_BUILD
1128  __kmp_itt_critical_acquiring( lck );
1129 #endif /* USE_ITT_BUILD */
1130  // The value of 'crit' should be usable as the critical_id of the critical section directive.
1131  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1132 
1133 #if USE_ITT_BUILD
1134  __kmp_itt_critical_acquired( lck );
1135 #endif /* USE_ITT_BUILD */
1136 
1137  KMP_START_EXPLICIT_TIMER(OMP_critical);
1138  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1139 #endif // KMP_USE_DYNAMIC_LOCK
1140 }
1141 
1142 #if KMP_USE_DYNAMIC_LOCK
1143 
1144 // Converts the given hint to an internal lock implementation
1145 static __forceinline kmp_dyna_lockseq_t
1146 __kmp_map_hint_to_lock(uintptr_t hint)
1147 {
1148 #if KMP_USE_TSX
1149 # define KMP_TSX_LOCK(seq) lockseq_##seq
1150 #else
1151 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1152 #endif
1153 
1154 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1155 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1156 #else
1157 # define KMP_CPUINFO_RTM 0
1158 #endif
1159 
1160  // Hints that do not require further logic
1161  if (hint & kmp_lock_hint_hle)
1162  return KMP_TSX_LOCK(hle);
1163  if (hint & kmp_lock_hint_rtm)
1164  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq;
1165  if (hint & kmp_lock_hint_adaptive)
1166  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq;
1167 
1168  // Rule out conflicting hints first by returning the default lock
1169  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1170  return __kmp_user_lock_seq;
1171  if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative))
1172  return __kmp_user_lock_seq;
1173 
1174  // Do not even consider speculation when it appears to be contended
1175  if (hint & omp_lock_hint_contended)
1176  return lockseq_queuing;
1177 
1178  // Uncontended lock without speculation
1179  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1180  return lockseq_tas;
1181 
1182  // HLE lock for speculation
1183  if (hint & omp_lock_hint_speculative)
1184  return KMP_TSX_LOCK(hle);
1185 
1186  return __kmp_user_lock_seq;
1187 }
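/*
 * Editor's note (illustration, not part of the original source): the hint bits
 * combined above are the OpenMP 4.5 omp_lock_hint_* values plus the
 * kmp_lock_hint_* extensions. Both __kmpc_critical_with_hint below and
 * __kmpc_init_lock_with_hint later in this file funnel their hint argument
 * through __kmp_map_hint_to_lock; for example, a speculative hint selects the
 * HLE sequence when TSX support is compiled in, and otherwise falls back to the
 * default __kmp_user_lock_seq. A hedged user-level sketch:
 *
 *   omp_lock_t l;
 *   omp_init_lock_with_hint(&l, omp_lock_hint_speculative);   // OpenMP 4.5 API
 */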
1188 
1201 void
1202 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint )
1203 {
1204  KMP_COUNT_BLOCK(OMP_CRITICAL);
1205  kmp_user_lock_p lck;
1206 
1207  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1208 
1209  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1210  // Check if it is initialized.
1211  if (*lk == 0) {
1212  kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1213  if (KMP_IS_D_LOCK(lckseq)) {
1214  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq));
1215  } else {
1216  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1217  }
1218  }
1219  // Branch for accessing the actual lock object and set operation. This branching is inevitable since
1220  // this lock initialization does not follow the normal dispatch path (lock table is not used).
1221  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1222  lck = (kmp_user_lock_p)lk;
1223  if (__kmp_env_consistency_check) {
1224  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1225  }
1226 # if USE_ITT_BUILD
1227  __kmp_itt_critical_acquiring(lck);
1228 # endif
1229 # if KMP_USE_INLINED_TAS
1230  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1231  KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1232  } else
1233 # elif KMP_USE_INLINED_FUTEX
1234  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1235  KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1236  } else
1237 # endif
1238  {
1239  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1240  }
1241  } else {
1242  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1243  lck = ilk->lock;
1244  if (__kmp_env_consistency_check) {
1245  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1246  }
1247 # if USE_ITT_BUILD
1248  __kmp_itt_critical_acquiring(lck);
1249 # endif
1250  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1251  }
1252 
1253 #if USE_ITT_BUILD
1254  __kmp_itt_critical_acquired( lck );
1255 #endif /* USE_ITT_BUILD */
1256 
1257  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1258  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1259 } // __kmpc_critical_with_hint
1260 
1261 #endif // KMP_USE_DYNAMIC_LOCK
1262 
1272 void
1273 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1274 {
1275  kmp_user_lock_p lck;
1276 
1277  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1278 
1279 #if KMP_USE_DYNAMIC_LOCK
1280  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1281  lck = (kmp_user_lock_p)crit;
1282  KMP_ASSERT(lck != NULL);
1283  if (__kmp_env_consistency_check) {
1284  __kmp_pop_sync(global_tid, ct_critical, loc);
1285  }
1286 # if USE_ITT_BUILD
1287  __kmp_itt_critical_releasing( lck );
1288 # endif
1289 # if KMP_USE_INLINED_TAS
1290  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1291  KMP_RELEASE_TAS_LOCK(lck, global_tid);
1292  } else
1293 # elif KMP_USE_INLINED_FUTEX
1294  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1295  KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1296  } else
1297 # endif
1298  {
1299  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1300  }
1301  } else {
1302  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1303  KMP_ASSERT(ilk != NULL);
1304  lck = ilk->lock;
1305  if (__kmp_env_consistency_check) {
1306  __kmp_pop_sync(global_tid, ct_critical, loc);
1307  }
1308 # if USE_ITT_BUILD
1309  __kmp_itt_critical_releasing( lck );
1310 # endif
1311  KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1312  }
1313 
1314 #else // KMP_USE_DYNAMIC_LOCK
1315 
1316  if ( ( __kmp_user_lock_kind == lk_tas )
1317  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1318  lck = (kmp_user_lock_p)crit;
1319  }
1320 #if KMP_USE_FUTEX
1321  else if ( ( __kmp_user_lock_kind == lk_futex )
1322  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1323  lck = (kmp_user_lock_p)crit;
1324  }
1325 #endif
1326  else { // ticket, queuing or drdpa
1327  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1328  }
1329 
1330  KMP_ASSERT(lck != NULL);
1331 
1332  if ( __kmp_env_consistency_check )
1333  __kmp_pop_sync( global_tid, ct_critical, loc );
1334 
1335 #if USE_ITT_BUILD
1336  __kmp_itt_critical_releasing( lck );
1337 #endif /* USE_ITT_BUILD */
1338  // The value of 'crit' should be usable as the critical_id of the critical section directive.
1339  __kmp_release_user_lock_with_checks( lck, global_tid );
1340 
1341 #if OMPT_SUPPORT && OMPT_BLAME
1342  if (ompt_enabled &&
1343  ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1344  ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1345  (uint64_t) lck);
1346  }
1347 #endif
1348 
1349 #endif // KMP_USE_DYNAMIC_LOCK
1350  KMP_POP_PARTITIONED_TIMER();
1351  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1352 }
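/*
 * Editor's illustration (not part of the original source): "#pragma omp
 * critical (name)" is typically lowered to a zero-initialized, file-scope
 * kmp_critical_name cache that is passed to the enter/exit pair; the runtime
 * lazily installs the real lock into it (see __kmp_init_indirect_csptr and
 * __kmp_get_critical_section_ptr above). Names below are hypothetical:
 *
 *   static kmp_critical_name crit_name_cache;   // one cache per named critical
 *
 *   void guarded_increment(ident_t *loc_example, int *counter) {
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       __kmpc_critical(loc_example, gtid, &crit_name_cache);
 *       (*counter)++;                            // critical region body
 *       __kmpc_end_critical(loc_example, gtid, &crit_name_cache);
 *   }
 */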
1353 
1362 kmp_int32
1363 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1364 {
1365  int status;
1366 
1367  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1368 
1369  if (! TCR_4(__kmp_init_parallel))
1370  __kmp_parallel_initialize();
1371 
1372  if ( __kmp_env_consistency_check )
1373  __kmp_check_barrier( global_tid, ct_barrier, loc );
1374 
1375 #if USE_ITT_NOTIFY
1376  __kmp_threads[global_tid]->th.th_ident = loc;
1377 #endif
1378  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1379 
1380  return (status != 0) ? 0 : 1;
1381 }
1382 
1392 void
1393 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1394 {
1395  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1396 
1397  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1398 }
1399 
1410 kmp_int32
1411 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1412 {
1413  kmp_int32 ret;
1414 
1415  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1416 
1417  if (! TCR_4(__kmp_init_parallel))
1418  __kmp_parallel_initialize();
1419 
1420  if ( __kmp_env_consistency_check ) {
1421  if ( loc == 0 ) {
1422  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1423  }
1424  __kmp_check_barrier( global_tid, ct_barrier, loc );
1425  }
1426 
1427 #if USE_ITT_NOTIFY
1428  __kmp_threads[global_tid]->th.th_ident = loc;
1429 #endif
1430  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1431 
1432  ret = __kmpc_master (loc, global_tid);
1433 
1434  if ( __kmp_env_consistency_check ) {
1435  /* there's no __kmpc_end_master called; so the (stats) */
1436  /* actions of __kmpc_end_master are done here */
1437 
1438  if ( global_tid < 0 ) {
1439  KMP_WARNING( ThreadIdentInvalid );
1440  }
1441  if (ret) {
1442  /* only one thread should do the pop since only */
1443  /* one did the push (see __kmpc_master()) */
1444 
1445  __kmp_pop_sync( global_tid, ct_master, loc );
1446  }
1447  }
1448 
1449  return (ret);
1450 }
1451 
1452 /* The BARRIER for a SINGLE process section is always explicit */
1464 kmp_int32
1465 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1466 {
1467  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1468 
1469  if (rc) {
1470  // We are going to execute the single statement, so we should count it.
1471  KMP_COUNT_BLOCK(OMP_SINGLE);
1472  KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1473  }
1474 
1475 #if OMPT_SUPPORT && OMPT_TRACE
1476  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1477  kmp_team_t *team = this_thr -> th.th_team;
1478  int tid = __kmp_tid_from_gtid( global_tid );
1479 
1480  if (ompt_enabled) {
1481  if (rc) {
1482  if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1483  ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1484  team->t.ompt_team_info.parallel_id,
1485  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1486  team->t.ompt_team_info.microtask);
1487  }
1488  } else {
1489  if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1490  ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1491  team->t.ompt_team_info.parallel_id,
1492  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1493  }
1494  this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1495  }
1496  }
1497 #endif
1498 
1499  return rc;
1500 }
1501 
1511 void
1512 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1513 {
1514  __kmp_exit_single( global_tid );
1515  KMP_POP_PARTITIONED_TIMER();
1516 
1517 #if OMPT_SUPPORT && OMPT_TRACE
1518  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1519  kmp_team_t *team = this_thr -> th.th_team;
1520  int tid = __kmp_tid_from_gtid( global_tid );
1521 
1522  if (ompt_enabled &&
1523  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1524  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1525  team->t.ompt_team_info.parallel_id,
1526  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1527  }
1528 #endif
1529 }
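/*
 * Editor's illustration (not part of the original source): "#pragma omp single"
 * is typically lowered to a guarded region followed by the construct's implicit
 * barrier (omitted for "nowait"). Only the thread for which __kmpc_single
 * returns 1 runs the body and calls __kmpc_end_single. Names are hypothetical:
 *
 *   void single_region(ident_t *loc_example, int *shared_val) {
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       if (__kmpc_single(loc_example, gtid)) {
 *           *shared_val = 42;                 // executed by exactly one thread
 *           __kmpc_end_single(loc_example, gtid);
 *       }
 *       __kmpc_barrier(loc_example, gtid);    // implicit barrier of 'single'
 *   }
 */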
1530 
1538 void
1539 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1540 {
1541  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1542 
1543 #if OMPT_SUPPORT && OMPT_TRACE
1544  if (ompt_enabled &&
1545  ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1546  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1547  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1548  ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1549  team_info->parallel_id, task_info->task_id);
1550  }
1551 #endif
1552 
1553  if ( __kmp_env_consistency_check )
1554  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1555 }
1556 
1557 /*
1558  * User routines which take C-style arguments (call by value)
1559  * different from the Fortran equivalent routines
1560  */
1561 
1562 void
1563 ompc_set_num_threads( int arg )
1564 {
1565 // !!!!! TODO: check the per-task binding
1566  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1567 }
1568 
1569 void
1570 ompc_set_dynamic( int flag )
1571 {
1572  kmp_info_t *thread;
1573 
1574  /* For the thread-private implementation of the internal controls */
1575  thread = __kmp_entry_thread();
1576 
1577  __kmp_save_internal_controls( thread );
1578 
1579  set__dynamic( thread, flag ? TRUE : FALSE );
1580 }
1581 
1582 void
1583 ompc_set_nested( int flag )
1584 {
1585  kmp_info_t *thread;
1586 
1587  /* For the thread-private internal controls implementation */
1588  thread = __kmp_entry_thread();
1589 
1590  __kmp_save_internal_controls( thread );
1591 
1592  set__nested( thread, flag ? TRUE : FALSE );
1593 }
1594 
1595 void
1596 ompc_set_max_active_levels( int max_active_levels )
1597 {
1598  /* TO DO */
1599  /* we want per-task implementation of this internal control */
1600 
1601  /* For the per-thread internal controls implementation */
1602  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1603 }
1604 
1605 void
1606 ompc_set_schedule( omp_sched_t kind, int modifier )
1607 {
1608 // !!!!! TODO: check the per-task binding
1609  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1610 }
1611 
1612 int
1613 ompc_get_ancestor_thread_num( int level )
1614 {
1615  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1616 }
1617 
1618 int
1619 ompc_get_team_size( int level )
1620 {
1621  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1622 }
1623 
1624 void
1625 kmpc_set_stacksize( int arg )
1626 {
1627  // __kmp_aux_set_stacksize initializes the library if needed
1628  __kmp_aux_set_stacksize( arg );
1629 }
1630 
1631 void
1632 kmpc_set_stacksize_s( size_t arg )
1633 {
1634  // __kmp_aux_set_stacksize initializes the library if needed
1635  __kmp_aux_set_stacksize( arg );
1636 }
1637 
1638 void
1639 kmpc_set_blocktime( int arg )
1640 {
1641  int gtid, tid;
1642  kmp_info_t *thread;
1643 
1644  gtid = __kmp_entry_gtid();
1645  tid = __kmp_tid_from_gtid(gtid);
1646  thread = __kmp_thread_from_gtid(gtid);
1647 
1648  __kmp_aux_set_blocktime( arg, thread, tid );
1649 }
1650 
1651 void
1652 kmpc_set_library( int arg )
1653 {
1654  // __kmp_user_set_library initializes the library if needed
1655  __kmp_user_set_library( (enum library_type)arg );
1656 }
1657 
1658 void
1659 kmpc_set_defaults( char const * str )
1660 {
1661  // __kmp_aux_set_defaults initializes the library if needed
1662  __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1663 }
1664 
1665 void
1666 kmpc_set_disp_num_buffers( int arg )
1667 {
1668  // ignore after initialization because some teams have already
1669  // allocated dispatch buffers
1670  if( __kmp_init_serial == 0 && arg > 0 )
1671  __kmp_dispatch_num_buffers = arg;
1672 }
1673 
1674 int
1675 kmpc_set_affinity_mask_proc( int proc, void **mask )
1676 {
1677 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1678  return -1;
1679 #else
1680  if ( ! TCR_4(__kmp_init_middle) ) {
1681  __kmp_middle_initialize();
1682  }
1683  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1684 #endif
1685 }
1686 
1687 int
1688 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1689 {
1690 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1691  return -1;
1692 #else
1693  if ( ! TCR_4(__kmp_init_middle) ) {
1694  __kmp_middle_initialize();
1695  }
1696  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1697 #endif
1698 }
1699 
1700 int
1701 kmpc_get_affinity_mask_proc( int proc, void **mask )
1702 {
1703 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1704  return -1;
1705 #else
1706  if ( ! TCR_4(__kmp_init_middle) ) {
1707  __kmp_middle_initialize();
1708  }
1709  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1710 #endif
1711 }
1712 
1713 
1714 /* -------------------------------------------------------------------------- */
1755 void
1756 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1757 {
1758  void **data_ptr;
1759 
1760  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1761 
1762  KMP_MB();
1763 
1764  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1765 
1766  if ( __kmp_env_consistency_check ) {
1767  if ( loc == 0 ) {
1768  KMP_WARNING( ConstructIdentInvalid );
1769  }
1770  }
1771 
1772  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1773 
1774  if (didit) *data_ptr = cpy_data;
1775 
1776  /* This barrier is not a barrier region boundary */
1777 #if USE_ITT_NOTIFY
1778  __kmp_threads[gtid]->th.th_ident = loc;
1779 #endif
1780  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1781 
1782  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1783 
1784  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1785  /* Nesting checks are already handled by the single construct checks */
1786 
1787 #if USE_ITT_NOTIFY
1788  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1789 #endif
1790  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1791 }
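/*
 * Editor's illustration (not part of the original source): for "#pragma omp
 * single copyprivate(x)" a compiler typically emits a small copy helper and
 * passes it, together with the address of the caller's private data and the
 * "didit" flag returned by __kmpc_single, to __kmpc_copyprivate. Real lowerings
 * pass a list of pointers; this sketch broadcasts a single int and all names
 * are hypothetical:
 *
 *   static void copy_int(void *dst, void *src) {
 *       *(int *)dst = *(int *)src;
 *   }
 *
 *   void single_copyprivate(ident_t *loc_example, int *x) {
 *       kmp_int32 gtid = __kmpc_global_thread_num(loc_example);
 *       kmp_int32 didit = __kmpc_single(loc_example, gtid);
 *       if (didit) {
 *           *x = 42;                          // produced by the executing thread
 *           __kmpc_end_single(loc_example, gtid);
 *       }
 *       __kmpc_copyprivate(loc_example, gtid, sizeof(int), x, copy_int, didit);
 *   }
 */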
1792 
1793 /* -------------------------------------------------------------------------- */
1794 
1795 #define INIT_LOCK __kmp_init_user_lock_with_checks
1796 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1797 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1798 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1799 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1800 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1801 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1802 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1803 #define TEST_LOCK __kmp_test_user_lock_with_checks
1804 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1805 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1806 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1807 
1808 
1809 /*
1810  * TODO: Make check abort messages use location info & pass it
1811  * into with_checks routines
1812  */
1813 
1814 #if KMP_USE_DYNAMIC_LOCK
1815 
1816 // internal lock initializer
1817 static __forceinline void
1818 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1819 {
1820  if (KMP_IS_D_LOCK(seq)) {
1821  KMP_INIT_D_LOCK(lock, seq);
1822 #if USE_ITT_BUILD
1823  __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1824 #endif
1825  } else {
1826  KMP_INIT_I_LOCK(lock, seq);
1827 #if USE_ITT_BUILD
1828  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1829  __kmp_itt_lock_creating(ilk->lock, loc);
1830 #endif
1831  }
1832 }
1833 
1834 // internal nest lock initializer
1835 static __forceinline void
1836 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1837 {
1838 #if KMP_USE_TSX
1839  // Don't have nested lock implementation for speculative locks
1840  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1841  seq = __kmp_user_lock_seq;
1842 #endif
1843  switch (seq) {
1844  case lockseq_tas:
1845  seq = lockseq_nested_tas;
1846  break;
1847 #if KMP_USE_FUTEX
1848  case lockseq_futex:
1849  seq = lockseq_nested_futex;
1850  break;
1851 #endif
1852  case lockseq_ticket:
1853  seq = lockseq_nested_ticket;
1854  break;
1855  case lockseq_queuing:
1856  seq = lockseq_nested_queuing;
1857  break;
1858  case lockseq_drdpa:
1859  seq = lockseq_nested_drdpa;
1860  break;
1861  default:
1862  seq = lockseq_nested_queuing;
1863  }
1864  KMP_INIT_I_LOCK(lock, seq);
1865 #if USE_ITT_BUILD
1866  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1867  __kmp_itt_lock_creating(ilk->lock, loc);
1868 #endif
1869 }
1870 
1871 /* initialize the lock with a hint */
1872 void
1873 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1874 {
1875  KMP_DEBUG_ASSERT(__kmp_init_serial);
1876  if (__kmp_env_consistency_check && user_lock == NULL) {
1877  KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1878  }
1879 
1880  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1881 }
1882 
1883 /* initialize the lock with a hint */
1884 void
1885 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1886 {
1887  KMP_DEBUG_ASSERT(__kmp_init_serial);
1888  if (__kmp_env_consistency_check && user_lock == NULL) {
1889  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1890  }
1891 
1892  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1893 }
1894 
1895 #endif // KMP_USE_DYNAMIC_LOCK
1896 
1897 /* initialize the lock */
1898 void
1899 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1900 #if KMP_USE_DYNAMIC_LOCK
1901  KMP_DEBUG_ASSERT(__kmp_init_serial);
1902  if (__kmp_env_consistency_check && user_lock == NULL) {
1903  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1904  }
1905  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1906 
1907 #else // KMP_USE_DYNAMIC_LOCK
1908 
1909  static char const * const func = "omp_init_lock";
1910  kmp_user_lock_p lck;
1911  KMP_DEBUG_ASSERT( __kmp_init_serial );
1912 
1913  if ( __kmp_env_consistency_check ) {
1914  if ( user_lock == NULL ) {
1915  KMP_FATAL( LockIsUninitialized, func );
1916  }
1917  }
1918 
1919  KMP_CHECK_USER_LOCK_INIT();
1920 
1921  if ( ( __kmp_user_lock_kind == lk_tas )
1922  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1923  lck = (kmp_user_lock_p)user_lock;
1924  }
1925 #if KMP_USE_FUTEX
1926  else if ( ( __kmp_user_lock_kind == lk_futex )
1927  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1928  lck = (kmp_user_lock_p)user_lock;
1929  }
1930 #endif
1931  else {
1932  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1933  }
1934  INIT_LOCK( lck );
1935  __kmp_set_user_lock_location( lck, loc );
1936 
1937 #if OMPT_SUPPORT && OMPT_TRACE
1938  if (ompt_enabled &&
1939  ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1940  ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck);
1941  }
1942 #endif
1943 
1944 #if USE_ITT_BUILD
1945  __kmp_itt_lock_creating( lck );
1946 #endif /* USE_ITT_BUILD */
1947 
1948 #endif // KMP_USE_DYNAMIC_LOCK
1949 } // __kmpc_init_lock
1950 
1951 /* initialize the lock */
1952 void
1953 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1954 #if KMP_USE_DYNAMIC_LOCK
1955 
1956  KMP_DEBUG_ASSERT(__kmp_init_serial);
1957  if (__kmp_env_consistency_check && user_lock == NULL) {
1958  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1959  }
1960  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1961 
1962 #else // KMP_USE_DYNAMIC_LOCK
1963 
1964  static char const * const func = "omp_init_nest_lock";
1965  kmp_user_lock_p lck;
1966  KMP_DEBUG_ASSERT( __kmp_init_serial );
1967 
1968  if ( __kmp_env_consistency_check ) {
1969  if ( user_lock == NULL ) {
1970  KMP_FATAL( LockIsUninitialized, func );
1971  }
1972  }
1973 
1974  KMP_CHECK_USER_LOCK_INIT();
1975 
1976  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1977  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1978  lck = (kmp_user_lock_p)user_lock;
1979  }
1980 #if KMP_USE_FUTEX
1981  else if ( ( __kmp_user_lock_kind == lk_futex )
1982  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1983  <= OMP_NEST_LOCK_T_SIZE ) ) {
1984  lck = (kmp_user_lock_p)user_lock;
1985  }
1986 #endif
1987  else {
1988  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1989  }
1990 
1991  INIT_NESTED_LOCK( lck );
1992  __kmp_set_user_lock_location( lck, loc );
1993 
1994 #if OMPT_SUPPORT && OMPT_TRACE
1995  if (ompt_enabled &&
1996  ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
1997  ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck);
1998  }
1999 #endif
2000 
2001 #if USE_ITT_BUILD
2002  __kmp_itt_lock_creating( lck );
2003 #endif /* USE_ITT_BUILD */
2004 
2005 #endif // KMP_USE_DYNAMIC_LOCK
2006 } // __kmpc_init_nest_lock
2007 
2008 void
2009 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2010 #if KMP_USE_DYNAMIC_LOCK
2011 
2012 # if USE_ITT_BUILD
2013  kmp_user_lock_p lck;
2014  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2015  lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2016  } else {
2017  lck = (kmp_user_lock_p)user_lock;
2018  }
2019  __kmp_itt_lock_destroyed(lck);
2020 # endif
2021  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2022 #else
2023  kmp_user_lock_p lck;
2024 
2025  if ( ( __kmp_user_lock_kind == lk_tas )
2026  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2027  lck = (kmp_user_lock_p)user_lock;
2028  }
2029 #if KMP_USE_FUTEX
2030  else if ( ( __kmp_user_lock_kind == lk_futex )
2031  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2032  lck = (kmp_user_lock_p)user_lock;
2033  }
2034 #endif
2035  else {
2036  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
2037  }
2038 
2039 #if OMPT_SUPPORT && OMPT_TRACE
2040  if (ompt_enabled &&
2041  ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
2042  ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck);
2043  }
2044 #endif
2045 
2046 #if USE_ITT_BUILD
2047  __kmp_itt_lock_destroyed( lck );
2048 #endif /* USE_ITT_BUILD */
2049  DESTROY_LOCK( lck );
2050 
2051  if ( ( __kmp_user_lock_kind == lk_tas )
2052  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2053  ;
2054  }
2055 #if KMP_USE_FUTEX
2056  else if ( ( __kmp_user_lock_kind == lk_futex )
2057  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2058  ;
2059  }
2060 #endif
2061  else {
2062  __kmp_user_lock_free( user_lock, gtid, lck );
2063  }
2064 #endif // KMP_USE_DYNAMIC_LOCK
2065 } // __kmpc_destroy_lock
2066 
2067 /* destroy the lock */
2068 void
2069 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2070 #if KMP_USE_DYNAMIC_LOCK
2071 
2072 # if USE_ITT_BUILD
2073  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2074  __kmp_itt_lock_destroyed(ilk->lock);
2075 # endif
2076  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2077 
2078 #else // KMP_USE_DYNAMIC_LOCK
2079 
2080  kmp_user_lock_p lck;
2081 
2082  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2083  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2084  lck = (kmp_user_lock_p)user_lock;
2085  }
2086 #if KMP_USE_FUTEX
2087  else if ( ( __kmp_user_lock_kind == lk_futex )
2088  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2089  <= OMP_NEST_LOCK_T_SIZE ) ) {
2090  lck = (kmp_user_lock_p)user_lock;
2091  }
2092 #endif
2093  else {
2094  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
2095  }
2096 
2097 #if OMPT_SUPPORT && OMPT_TRACE
2098  if (ompt_enabled &&
2099  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2100  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck);
2101  }
2102 #endif
2103 
2104 #if USE_ITT_BUILD
2105  __kmp_itt_lock_destroyed( lck );
2106 #endif /* USE_ITT_BUILD */
2107 
2108  DESTROY_NESTED_LOCK( lck );
2109 
2110  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2111  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2112  ;
2113  }
2114 #if KMP_USE_FUTEX
2115  else if ( ( __kmp_user_lock_kind == lk_futex )
2116  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2117  <= OMP_NEST_LOCK_T_SIZE ) ) {
2118  ;
2119  }
2120 #endif
2121  else {
2122  __kmp_user_lock_free( user_lock, gtid, lck );
2123  }
2124 #endif // KMP_USE_DYNAMIC_LOCK
2125 } // __kmpc_destroy_nest_lock
2126 
2127 void
2128 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2129  KMP_COUNT_BLOCK(OMP_set_lock);
2130 #if KMP_USE_DYNAMIC_LOCK
2131  int tag = KMP_EXTRACT_D_TAG(user_lock);
2132 # if USE_ITT_BUILD
2133  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
2134 # endif
2135 # if KMP_USE_INLINED_TAS
2136  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2137  KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2138  } else
2139 # elif KMP_USE_INLINED_FUTEX
2140  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2141  KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2142  } else
2143 # endif
2144  {
2145  __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2146  }
2147 # if USE_ITT_BUILD
2148  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2149 # endif
2150 
2151 #else // KMP_USE_DYNAMIC_LOCK
2152 
2153  kmp_user_lock_p lck;
2154 
2155  if ( ( __kmp_user_lock_kind == lk_tas )
2156  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2157  lck = (kmp_user_lock_p)user_lock;
2158  }
2159 #if KMP_USE_FUTEX
2160  else if ( ( __kmp_user_lock_kind == lk_futex )
2161  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2162  lck = (kmp_user_lock_p)user_lock;
2163  }
2164 #endif
2165  else {
2166  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
2167  }
2168 
2169 #if USE_ITT_BUILD
2170  __kmp_itt_lock_acquiring( lck );
2171 #endif /* USE_ITT_BUILD */
2172 
2173  ACQUIRE_LOCK( lck, gtid );
2174 
2175 #if USE_ITT_BUILD
2176  __kmp_itt_lock_acquired( lck );
2177 #endif /* USE_ITT_BUILD */
2178 
2179 #if OMPT_SUPPORT && OMPT_TRACE
2180  if (ompt_enabled &&
2181  ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2182  ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck);
2183  }
2184 #endif
2185 
2186 #endif // KMP_USE_DYNAMIC_LOCK
2187 }
2188 
2189 void
2190 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2191 #if KMP_USE_DYNAMIC_LOCK
2192 
2193 # if USE_ITT_BUILD
2194  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2195 # endif
2196  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2197 # if USE_ITT_BUILD
2198  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2199 # endif
2200 
2201 #if OMPT_SUPPORT && OMPT_TRACE
2202  if (ompt_enabled) {
2203  // missing support here: need to know whether acquired first or not
2204  }
2205 #endif
2206 
2207 #else // KMP_USE_DYNAMIC_LOCK
2208  int acquire_status;
2209  kmp_user_lock_p lck;
2210 
2211  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2212  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2213  lck = (kmp_user_lock_p)user_lock;
2214  }
2215 #if KMP_USE_FUTEX
2216  else if ( ( __kmp_user_lock_kind == lk_futex )
2217  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2218  <= OMP_NEST_LOCK_T_SIZE ) ) {
2219  lck = (kmp_user_lock_p)user_lock;
2220  }
2221 #endif
2222  else {
2223  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2224  }
2225 
2226 #if USE_ITT_BUILD
2227  __kmp_itt_lock_acquiring( lck );
2228 #endif /* USE_ITT_BUILD */
2229 
2230  ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status );
2231 
2232 #if USE_ITT_BUILD
2233  __kmp_itt_lock_acquired( lck );
2234 #endif /* USE_ITT_BUILD */
2235 
2236 #if OMPT_SUPPORT && OMPT_TRACE
2237  if (ompt_enabled) {
2238  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2239  if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2240  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck);
2241  } else {
2242  if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2243  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck);
2244  }
2245  }
2246 #endif
2247 
2248 #endif // KMP_USE_DYNAMIC_LOCK
2249 }
2250 
2251 void
2252 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2253 {
2254 #if KMP_USE_DYNAMIC_LOCK
2255 
2256  int tag = KMP_EXTRACT_D_TAG(user_lock);
2257 # if USE_ITT_BUILD
2258  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2259 # endif
2260 # if KMP_USE_INLINED_TAS
2261  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2262  KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2263  } else
2264 # elif KMP_USE_INLINED_FUTEX
2265  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2266  KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2267  } else
2268 # endif
2269  {
2270  __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2271  }
2272 
2273 #else // KMP_USE_DYNAMIC_LOCK
2274 
2275  kmp_user_lock_p lck;
2276 
2277  /* Can't use serial interval since not block structured */
2278  /* release the lock */
2279 
2280  if ( ( __kmp_user_lock_kind == lk_tas )
2281  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2282 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2283  // "fast" path implemented to fix customer performance issue
2284 #if USE_ITT_BUILD
2285  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2286 #endif /* USE_ITT_BUILD */
2287  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2288  KMP_MB();
2289  return;
2290 #else
2291  lck = (kmp_user_lock_p)user_lock;
2292 #endif
2293  }
2294 #if KMP_USE_FUTEX
2295  else if ( ( __kmp_user_lock_kind == lk_futex )
2296  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2297  lck = (kmp_user_lock_p)user_lock;
2298  }
2299 #endif
2300  else {
2301  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2302  }
2303 
2304 #if USE_ITT_BUILD
2305  __kmp_itt_lock_releasing( lck );
2306 #endif /* USE_ITT_BUILD */
2307 
2308  RELEASE_LOCK( lck, gtid );
2309 
2310 #if OMPT_SUPPORT && OMPT_BLAME
2311  if (ompt_enabled &&
2312  ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2313  ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2314  }
2315 #endif
2316 
2317 #endif // KMP_USE_DYNAMIC_LOCK
2318 }
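/*
 * Illustrative sketch (not part of this file): the "fast" path above simply
 * publishes 0 in the TAS poll word and then issues a full memory barrier.  With
 * C11 atomics that is roughly the following (toy type and names are hypothetical;
 * the fragment is not compiled):
 */
#if 0
#include <stdatomic.h>

typedef struct { atomic_int poll; } toy_tas_lock_t;   /* stand-in for tas.lk */

static void toy_tas_release(toy_tas_lock_t *l)
{
    /* roughly TCW_4(lk.poll, 0) followed by KMP_MB() */
    atomic_store_explicit(&l->poll, 0, memory_order_seq_cst);
}
#endif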
2319 
2320 /* release the lock */
2321 void
2322 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2323 {
2324 #if KMP_USE_DYNAMIC_LOCK
2325 
2326 # if USE_ITT_BUILD
2327  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2328 # endif
2329  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2330 
2331 #else // KMP_USE_DYNAMIC_LOCK
2332 
2333  kmp_user_lock_p lck;
2334 
2335  /* Can't use serial interval since not block structured */
2336 
2337  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2338  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2339 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2340  // "fast" path implemented to fix customer performance issue
2341  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2342 #if USE_ITT_BUILD
2343  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2344 #endif /* USE_ITT_BUILD */
2345  if ( --(tl->lk.depth_locked) == 0 ) {
2346  TCW_4(tl->lk.poll, 0);
2347  }
2348  KMP_MB();
2349  return;
2350 #else
2351  lck = (kmp_user_lock_p)user_lock;
2352 #endif
2353  }
2354 #if KMP_USE_FUTEX
2355  else if ( ( __kmp_user_lock_kind == lk_futex )
2356  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2357  <= OMP_NEST_LOCK_T_SIZE ) ) {
2358  lck = (kmp_user_lock_p)user_lock;
2359  }
2360 #endif
2361  else {
2362  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2363  }
2364 
2365 #if USE_ITT_BUILD
2366  __kmp_itt_lock_releasing( lck );
2367 #endif /* USE_ITT_BUILD */
2368 
2369  int release_status;
2370  release_status = RELEASE_NESTED_LOCK( lck, gtid );
2371 #if OMPT_SUPPORT && OMPT_BLAME
2372  if (ompt_enabled) {
2373  if (release_status == KMP_LOCK_RELEASED) {
2374  if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2375  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2376  (uint64_t) lck);
2377  }
2378  } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2379  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2380  (uint64_t) lck);
2381  }
2382  }
2383 #endif
2384 
2385 #endif // KMP_USE_DYNAMIC_LOCK
2386 }
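/*
 * Illustrative sketch (not part of this file): the nested fast path above only
 * clears the poll word once depth_locked drops to zero, which is what allows the
 * owning thread to re-acquire an omp_nest_lock_t recursively.  User-level view
 * (the fragment is not compiled):
 */
#if 0
#include <omp.h>

static void sketch_recurse(omp_nest_lock_t *l, int depth)   /* hypothetical helper */
{
    omp_set_nest_lock(l);            /* depth_locked: 1, 2, 3, ...                    */
    if (depth > 0)
        sketch_recurse(l, depth - 1);
    omp_unset_nest_lock(l);          /* lock only becomes free at the outermost unset */
}
#endif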
2387 
2388 /* try to acquire the lock */
2389 int
2390 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2391 {
2392  KMP_COUNT_BLOCK(OMP_test_lock);
2393 
2394 #if KMP_USE_DYNAMIC_LOCK
2395  int rc;
2396  int tag = KMP_EXTRACT_D_TAG(user_lock);
2397 # if USE_ITT_BUILD
2398  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2399 # endif
2400 # if KMP_USE_INLINED_TAS
2401  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2402  KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2403  } else
2404 # elif KMP_USE_INLINED_FUTEX
2405  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2406  KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2407  } else
2408 # endif
2409  {
2410  rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2411  }
2412  if (rc) {
2413 # if USE_ITT_BUILD
2414  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2415 # endif
2416  return FTN_TRUE;
2417  } else {
2418 # if USE_ITT_BUILD
2419  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2420 # endif
2421  return FTN_FALSE;
2422  }
2423 
2424 #else // KMP_USE_DYNAMIC_LOCK
2425 
2426  kmp_user_lock_p lck;
2427  int rc;
2428 
2429  if ( ( __kmp_user_lock_kind == lk_tas )
2430  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2431  lck = (kmp_user_lock_p)user_lock;
2432  }
2433 #if KMP_USE_FUTEX
2434  else if ( ( __kmp_user_lock_kind == lk_futex )
2435  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2436  lck = (kmp_user_lock_p)user_lock;
2437  }
2438 #endif
2439  else {
2440  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2441  }
2442 
2443 #if USE_ITT_BUILD
2444  __kmp_itt_lock_acquiring( lck );
2445 #endif /* USE_ITT_BUILD */
2446 
2447  rc = TEST_LOCK( lck, gtid );
2448 #if USE_ITT_BUILD
2449  if ( rc ) {
2450  __kmp_itt_lock_acquired( lck );
2451  } else {
2452  __kmp_itt_lock_cancelled( lck );
2453  }
2454 #endif /* USE_ITT_BUILD */
2455  return ( rc ? FTN_TRUE : FTN_FALSE );
2456 
2457  /* Can't use serial interval since not block structured */
2458 
2459 #endif // KMP_USE_DYNAMIC_LOCK
2460 }
2461 
2462 /* try to acquire the lock */
2463 int
2464 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2465 {
2466 #if KMP_USE_DYNAMIC_LOCK
2467  int rc;
2468 # if USE_ITT_BUILD
2469  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2470 # endif
2471  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2472 # if USE_ITT_BUILD
2473  if (rc) {
2474  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2475  } else {
2476  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2477  }
2478 # endif
2479  return rc;
2480 
2481 #else // KMP_USE_DYNAMIC_LOCK
2482 
2483  kmp_user_lock_p lck;
2484  int rc;
2485 
2486  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2487  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2488  lck = (kmp_user_lock_p)user_lock;
2489  }
2490 #if KMP_USE_FUTEX
2491  else if ( ( __kmp_user_lock_kind == lk_futex )
2492  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2493  <= OMP_NEST_LOCK_T_SIZE ) ) {
2494  lck = (kmp_user_lock_p)user_lock;
2495  }
2496 #endif
2497  else {
2498  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2499  }
2500 
2501 #if USE_ITT_BUILD
2502  __kmp_itt_lock_acquiring( lck );
2503 #endif /* USE_ITT_BUILD */
2504 
2505  rc = TEST_NESTED_LOCK( lck, gtid );
2506 #if USE_ITT_BUILD
2507  if ( rc ) {
2508  __kmp_itt_lock_acquired( lck );
2509  } else {
2510  __kmp_itt_lock_cancelled( lck );
2511  }
2512 #endif /* USE_ITT_BUILD */
2513  return rc;
2514 
2515  /* Can't use serial interval since not block structured */
2516 
2517 #endif // KMP_USE_DYNAMIC_LOCK
2518 }
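/*
 * Illustrative sketch (not part of this file): user-level counterparts of the two
 * test entry points above.  omp_test_lock() returns nonzero on success;
 * omp_test_nest_lock() returns the new nesting count, or 0 on failure.  Names in
 * the fragment are hypothetical and it is not compiled.
 */
#if 0
#include <omp.h>

static int sketch_try_work(omp_lock_t *l)
{
    if (!omp_test_lock(l))           /* cf. __kmpc_test_lock above                */
        return 0;                    /* lock busy: caller can do other work first */
    /* ... critical work ... */
    omp_unset_lock(l);
    return 1;
}
#endif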
2519 
2520 
2521 /*--------------------------------------------------------------------------------------------------------------------*/
2522 
2523 /*
 2524  * Interface to fast scalable reduce routines
2525  */
2526 
2527 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions;
2528 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then)
2529 // AT: which solution is better?
2530 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2531  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2532 
2533 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2534  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2535 
2536 // description of the packed_reduction_method variable: look at the macros in kmp.h
2537 
2538 
2539 // used in a critical section reduce block
2540 static __forceinline void
2541 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2542 
2543  // this lock was visible to a customer and to the threading profile tool as a serial overhead span
2544  // (although it's used for an internal purpose only)
2545  // why was it visible in previous implementation?
2546  // should we keep it visible in new reduce block?
2547  kmp_user_lock_p lck;
2548 
2549 #if KMP_USE_DYNAMIC_LOCK
2550 
2551  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2552  // Check if it is initialized.
2553  if (*lk == 0) {
2554  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2555  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq));
2556  } else {
2557  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq));
2558  }
2559  }
2560  // Branch for accessing the actual lock object and set operation. This branching is inevitable since
2561  // this lock initialization does not follow the normal dispatch path (lock table is not used).
2562  if (KMP_EXTRACT_D_TAG(lk) != 0) {
2563  lck = (kmp_user_lock_p)lk;
2564  KMP_DEBUG_ASSERT(lck != NULL);
2565  if (__kmp_env_consistency_check) {
2566  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2567  }
2568  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2569  } else {
2570  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2571  lck = ilk->lock;
2572  KMP_DEBUG_ASSERT(lck != NULL);
2573  if (__kmp_env_consistency_check) {
2574  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2575  }
2576  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2577  }
2578 
2579 #else // KMP_USE_DYNAMIC_LOCK
2580 
2581  // We know that the fast reduction code is only emitted by Intel compilers
2582  // with 32 byte critical sections. If there isn't enough space, then we
2583  // have to use a pointer.
2584  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2585  lck = (kmp_user_lock_p)crit;
2586  }
2587  else {
2588  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2589  }
2590  KMP_DEBUG_ASSERT( lck != NULL );
2591 
2592  if ( __kmp_env_consistency_check )
2593  __kmp_push_sync( global_tid, ct_critical, loc, lck );
2594 
2595  __kmp_acquire_user_lock_with_checks( lck, global_tid );
2596 
2597 #endif // KMP_USE_DYNAMIC_LOCK
2598 }
2599 
2600 // used in a critical section reduce block
2601 static __forceinline void
2602 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2603 
2604  kmp_user_lock_p lck;
2605 
2606 #if KMP_USE_DYNAMIC_LOCK
2607 
2608  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2609  lck = (kmp_user_lock_p)crit;
2610  if (__kmp_env_consistency_check)
2611  __kmp_pop_sync(global_tid, ct_critical, loc);
2612  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2613  } else {
2614  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2615  if (__kmp_env_consistency_check)
2616  __kmp_pop_sync(global_tid, ct_critical, loc);
2617  KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2618  }
2619 
2620 #else // KMP_USE_DYNAMIC_LOCK
2621 
2622  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2623  // sections. If there isn't enough space, then we have to use a pointer.
2624  if ( __kmp_base_user_lock_size > 32 ) {
2625  lck = *( (kmp_user_lock_p *) crit );
2626  KMP_ASSERT( lck != NULL );
2627  } else {
2628  lck = (kmp_user_lock_p) crit;
2629  }
2630 
2631  if ( __kmp_env_consistency_check )
2632  __kmp_pop_sync( global_tid, ct_critical, loc );
2633 
2634  __kmp_release_user_lock_with_checks( lck, global_tid );
2635 
2636 #endif // KMP_USE_DYNAMIC_LOCK
2637 } // __kmp_end_critical_section_reduce_block
2638 
2639 
2640 /* 2.a.i. Reduce Block without a terminating barrier */
2654 kmp_int32
2655 __kmpc_reduce_nowait(
2656  ident_t *loc, kmp_int32 global_tid,
2657  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2658  kmp_critical_name *lck ) {
2659 
2660  KMP_COUNT_BLOCK(REDUCE_nowait);
2661  int retval = 0;
2662  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2663 #if OMP_40_ENABLED
2664  kmp_team_t *team;
2665  kmp_info_t *th;
2666  int teams_swapped = 0, task_state;
2667 #endif
2668  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2669 
2670  // why do we need this initialization here at all?
2671  // Reduction clause can not be used as a stand-alone directive.
2672 
2673  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2674  // possible detection of false-positive race by the threadchecker ???
2675  if( ! TCR_4( __kmp_init_parallel ) )
2676  __kmp_parallel_initialize();
2677 
2678  // check correctness of reduce block nesting
2679 #if KMP_USE_DYNAMIC_LOCK
2680  if ( __kmp_env_consistency_check )
2681  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2682 #else
2683  if ( __kmp_env_consistency_check )
2684  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2685 #endif
2686 
2687 #if OMP_40_ENABLED
2688  th = __kmp_thread_from_gtid(global_tid);
2689  if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
2690  team = th->th.th_team;
2691  if( team->t.t_level == th->th.th_teams_level ) {
2692  // this is reduction at teams construct
2693  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2694  // Let's swap teams temporarily for the reduction barrier
2695  teams_swapped = 1;
2696  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2697  th->th.th_team = team->t.t_parent;
2698  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2699  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2700  task_state = th->th.th_task_state;
2701  th->th.th_task_state = 0;
2702  }
2703  }
2704 #endif // OMP_40_ENABLED
2705 
2706  // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
2707  // the variable should be either a construct-specific or thread-specific property, not a team specific property
2708  // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct)
2709  // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?)
 2710  // (if both construct-specific and team-specific variables were shared, unnecessary extra syncs would be needed)
 2711  // a thread-specific variable is better with regard to both issues above (next construct and extra syncs)
 2712  // a thread-specific "th_local.reduction_method" variable is used currently
 2713  // each thread executes the 'determine' and 'set' lines (no need to have only one thread do it; that would only add unnecessary syncs)
2714 
2715  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2716  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2717 
2718  if( packed_reduction_method == critical_reduce_block ) {
2719 
2720  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2721  retval = 1;
2722 
2723  } else if( packed_reduction_method == empty_reduce_block ) {
2724 
2725  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2726  retval = 1;
2727 
2728  } else if( packed_reduction_method == atomic_reduce_block ) {
2729 
2730  retval = 2;
2731 
2732  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
 2733  // (this is not ideal: the checking block is closed by this 'pop' even though the atomic operation
 2734  // has not been executed yet; it executes slightly later, literally on the next instruction)
2735  if ( __kmp_env_consistency_check )
2736  __kmp_pop_sync( global_tid, ct_reduce, loc );
2737 
2738  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2739 
2740  //AT: performance issue: a real barrier here
2741  //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them)
 2742  //AT: (it's not what a customer might expect when specifying the NOWAIT clause)
 2743  //AT: (specifying NOWAIT won't improve performance here; it will just be confusing to a customer)
 2744  //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
 2745  // and be more in line with the sense of NOWAIT
2746  //AT: TO DO: do epcc test and compare times
2747 
2748  // this barrier should be invisible to a customer and to the threading profile tool
2749  // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
2750 #if USE_ITT_NOTIFY
2751  __kmp_threads[global_tid]->th.th_ident = loc;
2752 #endif
2753  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2754  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2755 
2756  // all other workers except master should do this pop here
2757  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
2758  if ( __kmp_env_consistency_check ) {
2759  if( retval == 0 ) {
2760  __kmp_pop_sync( global_tid, ct_reduce, loc );
2761  }
2762  }
2763 
2764  } else {
2765 
2766  // should never reach this block
2767  KMP_ASSERT( 0 ); // "unexpected method"
2768 
2769  }
2770 #if OMP_40_ENABLED
2771  if( teams_swapped ) {
2772  // Restore thread structure
2773  th->th.th_info.ds.ds_tid = 0;
2774  th->th.th_team = team;
2775  th->th.th_team_nproc = team->t.t_nproc;
2776  th->th.th_task_team = team->t.t_task_team[task_state];
2777  th->th.th_task_state = task_state;
2778  }
2779 #endif
2780  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2781 
2782  return retval;
2783 }
2784 
2793 void
2794 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2795 
2796  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2797 
2798  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2799 
2800  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2801 
2802  if( packed_reduction_method == critical_reduce_block ) {
2803 
2804  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2805 
2806  } else if( packed_reduction_method == empty_reduce_block ) {
2807 
2808  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2809 
2810  } else if( packed_reduction_method == atomic_reduce_block ) {
2811 
2812  // neither master nor other workers should get here
2813  // (code gen does not generate this call in case 2: atomic reduce block)
 2814  // actually it would be better to remove this elseif altogether;
 2815  // after removal this value will be checked by the 'else' and will assert
2816 
2817  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2818 
2819  // only master gets here
2820 
2821  } else {
2822 
2823  // should never reach this block
2824  KMP_ASSERT( 0 ); // "unexpected method"
2825 
2826  }
2827 
2828  if ( __kmp_env_consistency_check )
2829  __kmp_pop_sync( global_tid, ct_reduce, loc );
2830 
2831  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2832 
2833  return;
2834 }
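/*
 * Illustrative sketch (not part of this file): the shape of the call sequence a
 * compiler typically emits around __kmpc_reduce_nowait()/__kmpc_end_reduce_nowait()
 * for a '+' reduction.  Variable names and codegen details are assumptions; the
 * return-value contract (1 = critical/tree path, 2 = atomic path, 0 = nothing to
 * do) follows the code above.  The fragment is not compiled.
 */
#if 0
static void sketch_plus_reduce(void *lhs, void *rhs)   /* hypothetical combiner */
{
    *(kmp_int32 *)lhs += *(kmp_int32 *)rhs;
}

static void sketch_reduce_nowait(ident_t *loc, kmp_int32 gtid,
                                 kmp_int32 *sum, kmp_int32 sum_priv,
                                 kmp_critical_name *crit)
{
    switch (__kmpc_reduce_nowait(loc, gtid, /* num_vars */ 1, sizeof(kmp_int32),
                                 &sum_priv, sketch_plus_reduce, crit)) {
    case 1:  /* critical section or tree master: combine, then close the region */
        *sum += sum_priv;
        __kmpc_end_reduce_nowait(loc, gtid, crit);
        break;
    case 2:  /* atomic path: combine with an atomic add; __kmpc_end_reduce_nowait()
                is not called here, matching the comment in that function above */
        break;
    default: /* tree worker (0): its contribution was already combined in the barrier */
        break;
    }
}
#endif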
2835 
2836 /* 2.a.ii. Reduce Block with a terminating barrier */
2837 
2851 kmp_int32
2852 __kmpc_reduce(
2853  ident_t *loc, kmp_int32 global_tid,
2854  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2855  void (*reduce_func)(void *lhs_data, void *rhs_data),
2856  kmp_critical_name *lck )
2857 {
2858  KMP_COUNT_BLOCK(REDUCE_wait);
2859  int retval = 0;
2860  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2861 
2862  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2863 
2864  // why do we need this initialization here at all?
2865  // Reduction clause can not be a stand-alone directive.
2866 
2867  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2868  // possible detection of false-positive race by the threadchecker ???
2869  if( ! TCR_4( __kmp_init_parallel ) )
2870  __kmp_parallel_initialize();
2871 
2872  // check correctness of reduce block nesting
2873 #if KMP_USE_DYNAMIC_LOCK
2874  if ( __kmp_env_consistency_check )
2875  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2876 #else
2877  if ( __kmp_env_consistency_check )
2878  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2879 #endif
2880 
2881  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2882  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2883 
2884  if( packed_reduction_method == critical_reduce_block ) {
2885 
2886  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2887  retval = 1;
2888 
2889  } else if( packed_reduction_method == empty_reduce_block ) {
2890 
2891  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2892  retval = 1;
2893 
2894  } else if( packed_reduction_method == atomic_reduce_block ) {
2895 
2896  retval = 2;
2897 
2898  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2899 
2900  //case tree_reduce_block:
2901  // this barrier should be visible to a customer and to the threading profile tool
2902  // (it's a terminating barrier on constructs if NOWAIT not specified)
2903 #if USE_ITT_NOTIFY
2904  __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2905 #endif
2906  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2907  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2908 
2909  // all other workers except master should do this pop here
2910  // ( none of other workers except master will enter __kmpc_end_reduce() )
2911  if ( __kmp_env_consistency_check ) {
2912  if( retval == 0 ) { // 0: all other workers; 1: master
2913  __kmp_pop_sync( global_tid, ct_reduce, loc );
2914  }
2915  }
2916 
2917  } else {
2918 
2919  // should never reach this block
2920  KMP_ASSERT( 0 ); // "unexpected method"
2921 
2922  }
2923 
2924  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2925 
2926  return retval;
2927 }
2928 
2938 void
2939 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2940 
2941  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2942 
2943  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2944 
2945  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2946 
2947  // this barrier should be visible to a customer and to the threading profile tool
2948  // (it's a terminating barrier on constructs if NOWAIT not specified)
2949 
2950  if( packed_reduction_method == critical_reduce_block ) {
2951 
2952  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2953 
2954  // TODO: implicit barrier: should be exposed
2955 #if USE_ITT_NOTIFY
2956  __kmp_threads[global_tid]->th.th_ident = loc;
2957 #endif
2958  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2959 
2960  } else if( packed_reduction_method == empty_reduce_block ) {
2961 
2962  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2963 
2964  // TODO: implicit barrier: should be exposed
2965 #if USE_ITT_NOTIFY
2966  __kmp_threads[global_tid]->th.th_ident = loc;
2967 #endif
2968  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2969 
2970  } else if( packed_reduction_method == atomic_reduce_block ) {
2971 
2972  // TODO: implicit barrier: should be exposed
2973 #if USE_ITT_NOTIFY
2974  __kmp_threads[global_tid]->th.th_ident = loc;
2975 #endif
2976  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2977 
2978  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2979 
2980  // only master executes here (master releases all other workers)
2981  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2982 
2983  } else {
2984 
2985  // should never reach this block
2986  KMP_ASSERT( 0 ); // "unexpected method"
2987 
2988  }
2989 
2990  if ( __kmp_env_consistency_check )
2991  __kmp_pop_sync( global_tid, ct_reduce, loc );
2992 
2993  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2994 
2995  return;
2996 }
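/*
 * Illustrative sketch (not part of this file): the blocking variant.  It differs
 * from the nowait sketch above in that __kmpc_end_reduce() is also reached on the
 * atomic path, so every method ends in the terminating barrier issued by
 * __kmpc_end_reduce().  Codegen details are assumptions; the fragment is not compiled.
 */
#if 0
static void sketch_reduce_block(ident_t *loc, kmp_int32 gtid,
                                kmp_int32 *sum, kmp_int32 sum_priv,
                                kmp_critical_name *crit)
{
    switch (__kmpc_reduce(loc, gtid, /* num_vars */ 1, sizeof(kmp_int32), &sum_priv,
                          sketch_plus_reduce /* see the nowait sketch above */, crit)) {
    case 1:  /* critical section or tree master: combine, then barrier via end call */
        *sum += sum_priv;
        __kmpc_end_reduce(loc, gtid, crit);
        break;
    case 2:  /* atomic path: combine with an atomic add, then still close the region */
        __kmpc_end_reduce(loc, gtid, crit);
        break;
    default: /* tree worker (0): released by the master in __kmpc_end_reduce() */
        break;
    }
}
#endif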
2997 
2998 #undef __KMP_GET_REDUCTION_METHOD
2999 #undef __KMP_SET_REDUCTION_METHOD
3000 
3001 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
3002 
3003 kmp_uint64
3004 __kmpc_get_taskid() {
3005 
3006  kmp_int32 gtid;
3007  kmp_info_t * thread;
3008 
3009  gtid = __kmp_get_gtid();
3010  if ( gtid < 0 ) {
3011  return 0;
3012  }; // if
3013  thread = __kmp_thread_from_gtid( gtid );
3014  return thread->th.th_current_task->td_task_id;
3015 
3016 } // __kmpc_get_taskid
3017 
3018 
3019 kmp_uint64
3020 __kmpc_get_parent_taskid() {
3021 
3022  kmp_int32 gtid;
3023  kmp_info_t * thread;
3024  kmp_taskdata_t * parent_task;
3025 
3026  gtid = __kmp_get_gtid();
3027  if ( gtid < 0 ) {
3028  return 0;
3029  }; // if
3030  thread = __kmp_thread_from_gtid( gtid );
3031  parent_task = thread->th.th_current_task->td_parent;
3032  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
3033 
3034 } // __kmpc_get_parent_taskid
3035 
3036 void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
3037 {
3038  if ( ! __kmp_init_serial ) {
3039  __kmp_serial_initialize();
3040  }
3041  __kmp_place_num_sockets = nS;
3042  __kmp_place_socket_offset = sO;
3043  __kmp_place_num_cores = nC;
3044  __kmp_place_core_offset = cO;
3045  __kmp_place_num_threads_per_core = nT;
3046 }
3047 
3048 #if OMP_45_ENABLED
3049 
3060 void
3061 __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
3062 {
3063  int j, idx;
3064  kmp_int64 last, trace_count;
3065  kmp_info_t *th = __kmp_threads[gtid];
3066  kmp_team_t *team = th->th.th_team;
3067  kmp_uint32 *flags;
3068  kmp_disp_t *pr_buf = th->th.th_dispatch;
3069  dispatch_shared_info_t *sh_buf;
3070 
3071  KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3072  gtid, num_dims, !team->t.t_serialized));
3073  KMP_DEBUG_ASSERT(dims != NULL);
3074  KMP_DEBUG_ASSERT(num_dims > 0);
3075 
3076  if( team->t.t_serialized ) {
3077  KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
3078  return; // no dependencies if team is serialized
3079  }
3080  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3081  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
3082  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3083 
3084  // Save bounds info into allocated private buffer
3085  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3086  pr_buf->th_doacross_info =
3087  (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
3088  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3089  pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
 3090  // Also save the address of num_done so it can be accessed later without knowing the buffer index
3091  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3092  pr_buf->th_doacross_info[2] = dims[0].lo;
3093  pr_buf->th_doacross_info[3] = dims[0].up;
3094  pr_buf->th_doacross_info[4] = dims[0].st;
3095  last = 5;
3096  for( j = 1; j < num_dims; ++j ) {
3097  kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
3098  if( dims[j].st == 1 ) { // most common case
 3099  // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
3100  range_length = dims[j].up - dims[j].lo + 1;
3101  } else {
3102  if( dims[j].st > 0 ) {
3103  KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3104  range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3105  } else { // negative increment
3106  KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3107  range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3108  }
3109  }
3110  pr_buf->th_doacross_info[last++] = range_length;
3111  pr_buf->th_doacross_info[last++] = dims[j].lo;
3112  pr_buf->th_doacross_info[last++] = dims[j].up;
3113  pr_buf->th_doacross_info[last++] = dims[j].st;
3114  }
3115 
3116  // Compute total trip count.
3117  // Start with range of dims[0] which we don't need to keep in the buffer.
3118  if( dims[0].st == 1 ) { // most common case
3119  trace_count = dims[0].up - dims[0].lo + 1;
3120  } else if( dims[0].st > 0 ) {
3121  KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3122  trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3123  } else { // negative increment
3124  KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3125  trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3126  }
3127  for( j = 1; j < num_dims; ++j ) {
3128  trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3129  }
3130  KMP_DEBUG_ASSERT(trace_count > 0);
3131 
3132  // Check if shared buffer is not occupied by other loop (idx - __kmp_dispatch_num_buffers)
3133  if( idx != sh_buf->doacross_buf_idx ) {
3134  // Shared buffer is occupied, wait for it to be free
3135  __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
3136  }
3137  // Check if we are the first thread. After the CAS the first thread gets 0,
3138  // others get 1 if initialization is in progress, allocated pointer otherwise.
3139  flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
3140  (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
3141  if( flags == NULL ) {
3142  // we are the first thread, allocate the array of flags
3143  kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3144  sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
3145  } else if( (kmp_int64)flags == 1 ) {
3146  // initialization is still in progress, need to wait
3147  while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
3148  KMP_YIELD(TRUE);
3149  }
3150  }
3151  KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
3152  pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy in order to not
3153  // touch shared buffer on each iteration
3154  KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
3155 }
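/*
 * Illustrative sketch (not part of this file): the OpenMP 4.5 source construct
 * these doacross entry points back.  A compiler typically brackets the loop with
 * __kmpc_doacross_init()/__kmpc_doacross_fini() and lowers 'depend(sink: ...)' to
 * __kmpc_doacross_wait() and 'depend(source)' to __kmpc_doacross_post(); the
 * lowering details are an assumption.  The fragment is not compiled.
 */
#if 0
void sketch_doacross(int n, double *a, const double *b)
{
    int i;
    #pragma omp parallel for ordered(1)
    for (i = 1; i < n; ++i) {
        #pragma omp ordered depend(sink: i - 1)
        a[i] = a[i - 1] + b[i];                 /* cross-iteration dependence */
        #pragma omp ordered depend(source)
    }
}
#endif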
3156 
3157 void
3158 __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
3159 {
3160  kmp_int32 shft, num_dims, i;
3161  kmp_uint32 flag;
3162  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3163  kmp_info_t *th = __kmp_threads[gtid];
3164  kmp_team_t *team = th->th.th_team;
3165  kmp_disp_t *pr_buf;
3166  kmp_int64 lo, up, st;
3167 
3168  KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3169  if( team->t.t_serialized ) {
3170  KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
3171  return; // no dependencies if team is serialized
3172  }
3173 
3174  // calculate sequential iteration number and check out-of-bounds condition
3175  pr_buf = th->th.th_dispatch;
3176  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3177  num_dims = pr_buf->th_doacross_info[0];
3178  lo = pr_buf->th_doacross_info[2];
3179  up = pr_buf->th_doacross_info[3];
3180  st = pr_buf->th_doacross_info[4];
3181  if( st == 1 ) { // most common case
3182  if( vec[0] < lo || vec[0] > up ) {
3183  KA_TRACE(20,(
3184  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3185  gtid, vec[0], lo, up));
3186  return;
3187  }
3188  iter_number = vec[0] - lo;
3189  } else if( st > 0 ) {
3190  if( vec[0] < lo || vec[0] > up ) {
3191  KA_TRACE(20,(
3192  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3193  gtid, vec[0], lo, up));
3194  return;
3195  }
3196  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3197  } else { // negative increment
3198  if( vec[0] > lo || vec[0] < up ) {
3199  KA_TRACE(20,(
3200  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3201  gtid, vec[0], lo, up));
3202  return;
3203  }
3204  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3205  }
3206  for( i = 1; i < num_dims; ++i ) {
3207  kmp_int64 iter, ln;
3208  kmp_int32 j = i * 4;
3209  ln = pr_buf->th_doacross_info[j + 1];
3210  lo = pr_buf->th_doacross_info[j + 2];
3211  up = pr_buf->th_doacross_info[j + 3];
3212  st = pr_buf->th_doacross_info[j + 4];
3213  if( st == 1 ) {
3214  if( vec[i] < lo || vec[i] > up ) {
3215  KA_TRACE(20,(
3216  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3217  gtid, vec[i], lo, up));
3218  return;
3219  }
3220  iter = vec[i] - lo;
3221  } else if( st > 0 ) {
3222  if( vec[i] < lo || vec[i] > up ) {
3223  KA_TRACE(20,(
3224  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3225  gtid, vec[i], lo, up));
3226  return;
3227  }
3228  iter = (kmp_uint64)(vec[i] - lo) / st;
3229  } else { // st < 0
3230  if( vec[i] > lo || vec[i] < up ) {
3231  KA_TRACE(20,(
3232  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3233  gtid, vec[i], lo, up));
3234  return;
3235  }
3236  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3237  }
3238  iter_number = iter + ln * iter_number;
3239  }
3240  shft = iter_number % 32; // use 32-bit granularity
3241  iter_number >>= 5; // divided by 32
3242  flag = 1 << shft;
3243  while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
3244  KMP_YIELD(TRUE);
3245  }
3246  KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3247  gtid, (iter_number<<5)+shft));
3248 }
3249 
3250 void
3251 __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
3252 {
3253  kmp_int32 shft, num_dims, i;
3254  kmp_uint32 flag;
3255  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3256  kmp_info_t *th = __kmp_threads[gtid];
3257  kmp_team_t *team = th->th.th_team;
3258  kmp_disp_t *pr_buf;
3259  kmp_int64 lo, st;
3260 
3261  KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3262  if( team->t.t_serialized ) {
3263  KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
3264  return; // no dependencies if team is serialized
3265  }
3266 
3267  // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
3268  pr_buf = th->th.th_dispatch;
3269  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3270  num_dims = pr_buf->th_doacross_info[0];
3271  lo = pr_buf->th_doacross_info[2];
3272  st = pr_buf->th_doacross_info[4];
3273  if( st == 1 ) { // most common case
3274  iter_number = vec[0] - lo;
3275  } else if( st > 0 ) {
3276  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3277  } else { // negative increment
3278  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3279  }
3280  for( i = 1; i < num_dims; ++i ) {
3281  kmp_int64 iter, ln;
3282  kmp_int32 j = i * 4;
3283  ln = pr_buf->th_doacross_info[j + 1];
3284  lo = pr_buf->th_doacross_info[j + 2];
3285  st = pr_buf->th_doacross_info[j + 4];
3286  if( st == 1 ) {
3287  iter = vec[i] - lo;
3288  } else if( st > 0 ) {
3289  iter = (kmp_uint64)(vec[i] - lo) / st;
3290  } else { // st < 0
3291  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3292  }
3293  iter_number = iter + ln * iter_number;
3294  }
3295  shft = iter_number % 32; // use 32-bit granularity
3296  iter_number >>= 5; // divided by 32
3297  flag = 1 << shft;
3298  if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
3299  KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
3300  KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
3301  gtid, (iter_number<<5)+shft));
3302 }
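/*
 * Illustrative sketch (not part of this file): both the wait and post paths above
 * map the linearized iteration number onto one bit of the shared flag array using
 * 32-bit words.  Worked example with a hypothetical value:
 */
#if 0
kmp_int64 iter_number = 100;            /* hypothetical linearized iteration  */
kmp_int32 shft = iter_number % 32;      /* 100 % 32 == 4  -> bit within word  */
kmp_int64 word = iter_number >> 5;      /* 100 / 32 == 3  -> index into flags */
kmp_uint32 flag = 1 << shft;            /* 0x10: test/set bit 4 of word 3     */
#endif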
3303 
3304 void
3305 __kmpc_doacross_fini(ident_t *loc, int gtid)
3306 {
3307  kmp_int64 num_done;
3308  kmp_info_t *th = __kmp_threads[gtid];
3309  kmp_team_t *team = th->th.th_team;
3310  kmp_disp_t *pr_buf = th->th.th_dispatch;
3311 
3312  KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
3313  if( team->t.t_serialized ) {
3314  KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
3315  return; // nothing to do
3316  }
3317  num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
3318  if( num_done == th->th.th_team_nproc ) {
3319  // we are the last thread, need to free shared resources
3320  int idx = pr_buf->th_doacross_buf_idx - 1;
3321  dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3322  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
3323  KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
3324  KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
3325  __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
3326  sh_buf->doacross_flags = NULL;
3327  sh_buf->doacross_num_done = 0;
3328  sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use
3329  }
3330  // free private resources (need to keep buffer index forever)
3331  __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
3332  pr_buf->th_doacross_info = NULL;
3333  KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
3334 }
3335 #endif
3336 
3337 // end of file //
3338 