LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
17  * it may change value between parallel regions. __kmp_max_nth
18  * is the largest value __kmp_nth may take; 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-specific.h"
40 #endif
41 
42 /* ------------------------------------------------------------------------ */
43 
44 #if KMP_STATIC_STEAL_ENABLED
45 
46 // replaces dispatch_private_info{32,64} structures and
47 // dispatch_private_info{32,64}_t types
48 template <typename T> struct dispatch_private_infoXX_template {
49  typedef typename traits_t<T>::unsigned_t UT;
50  typedef typename traits_t<T>::signed_t ST;
51  UT count; // unsigned
52  T ub;
53  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
54  T lb;
55  ST st; // signed
56  UT tc; // unsigned
57  T static_steal_counter; // for static_steal only; maybe better to put after ub
58 
59  /* parm[1-4] are used in different ways by different scheduling algorithms */
60 
61  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
62  // a) parm3 is properly aligned and
63  // b) all parm1-4 are in the same cache line.
64  // Because parm1-4 are used together, performance seems to be better
65  // if they are in the same line (not measured though).
66 
67  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
68  T parm1;
69  T parm2;
70  T parm3;
71  T parm4;
72  };
73 
74  UT ordered_lower; // unsigned
75  UT ordered_upper; // unsigned
76 #if KMP_OS_WINDOWS
77  T last_upper;
78 #endif /* KMP_OS_WINDOWS */
79 };
80 
81 #else /* KMP_STATIC_STEAL_ENABLED */
82 
83 // replaces dispatch_private_info{32,64} structures and
84 // dispatch_private_info{32,64}_t types
85 template <typename T> struct dispatch_private_infoXX_template {
86  typedef typename traits_t<T>::unsigned_t UT;
87  typedef typename traits_t<T>::signed_t ST;
88  T lb;
89  T ub;
90  ST st; // signed
91  UT tc; // unsigned
92 
93  T parm1;
94  T parm2;
95  T parm3;
96  T parm4;
97 
98  UT count; // unsigned
99 
100  UT ordered_lower; // unsigned
101  UT ordered_upper; // unsigned
102 #if KMP_OS_WINDOWS
103  T last_upper;
104 #endif /* KMP_OS_WINDOWS */
105 };
106 
107 #endif /* KMP_STATIC_STEAL_ENABLED */
108 
109 // replaces dispatch_private_info structure and dispatch_private_info_t type
110 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
111  // duplicate alignment here, otherwise size of structure is not correct in our
112  // compiler
113  union KMP_ALIGN_CACHE private_info_tmpl {
114  dispatch_private_infoXX_template<T> p;
115  dispatch_private_info64_t p64;
116  } u;
117  enum sched_type schedule; /* scheduling algorithm */
118  kmp_uint32 ordered; /* ordered clause specified */
119  kmp_uint32 ordered_bumped;
120  // To retain the structure size after making ordered_iteration scalar
121  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
122  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
123  kmp_uint32 nomerge; /* don't merge iters if serialized */
124  kmp_uint32 type_size;
125  enum cons_type pushed_ws;
126 };
127 
128 // replaces dispatch_shared_info{32,64} structures and
129 // dispatch_shared_info{32,64}_t types
130 template <typename UT> struct dispatch_shared_infoXX_template {
131  /* chunk index under dynamic, number of idle threads under static-steal;
132  iteration index otherwise */
133  volatile UT iteration;
134  volatile UT num_done;
135  volatile UT ordered_iteration;
136  // to retain the structure size after making ordered_iteration scalar
137  UT ordered_dummy[KMP_MAX_ORDERED - 3];
138 };
139 
140 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
141 template <typename UT> struct dispatch_shared_info_template {
142  // we need a union here to keep the structure size
143  union shared_info_tmpl {
144  dispatch_shared_infoXX_template<UT> s;
145  dispatch_shared_info64_t s64;
146  } u;
147  volatile kmp_uint32 buffer_index;
148 #if OMP_45_ENABLED
149  volatile kmp_int32 doacross_buf_idx; // teamwise index
150  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
151  kmp_int32 doacross_num_done; // count finished threads
152 #endif
153 #if KMP_USE_HWLOC
154  // When linking with libhwloc, the ORDERED EPCC test slows down on big
155  // machines (> 48 cores). Performance analysis showed that a cache thrash
156  // was occurring and this padding helps alleviate the problem.
157  char padding[64];
158 #endif
159 };
160 
161 /* ------------------------------------------------------------------------ */
162 
163 #undef USE_TEST_LOCKS
164 
165 // test_then_add template (general template should NOT be used)
166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
167 
168 template <>
169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
170  kmp_int32 d) {
171  kmp_int32 r;
172  r = KMP_TEST_THEN_ADD32(p, d);
173  return r;
174 }
175 
176 template <>
177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
178  kmp_int64 d) {
179  kmp_int64 r;
180  r = KMP_TEST_THEN_ADD64(p, d);
181  return r;
182 }
183 
184 // test_then_inc_acq template (general template should NOT be used)
185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
186 
187 template <>
188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
189  kmp_int32 r;
190  r = KMP_TEST_THEN_INC_ACQ32(p);
191  return r;
192 }
193 
194 template <>
195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
196  kmp_int64 r;
197  r = KMP_TEST_THEN_INC_ACQ64(p);
198  return r;
199 }
200 
201 // test_then_inc template (general template should NOT be used)
202 template <typename T> static __forceinline T test_then_inc(volatile T *p);
203 
204 template <>
205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
206  kmp_int32 r;
207  r = KMP_TEST_THEN_INC32(p);
208  return r;
209 }
210 
211 template <>
212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
213  kmp_int64 r;
214  r = KMP_TEST_THEN_INC64(p);
215  return r;
216 }
217 
218 // compare_and_swap template (general template should NOT be used)
219 template <typename T>
220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
221 
222 template <>
223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
224  kmp_int32 c, kmp_int32 s) {
225  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
226 }
227 
228 template <>
229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
230  kmp_int64 c, kmp_int64 s) {
231  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
232 }
233 
234 /* Spin wait loop that first does pause, then yield.
235  Waits until function returns non-zero when called with *spinner and check.
236  Does NOT put threads to sleep.
237 #if USE_ITT_BUILD
238  Arguments:
239  obj -- is higher-level synchronization object to report to ittnotify.
240  It is used to report locks consistently. For example, if the lock is
241  acquired immediately, its address is reported to ittnotify via
242  KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
243  and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
244  same address, not the address of the low-level spinner.
245 #endif // USE_ITT_BUILD
246 */
247 template <typename UT>
248 // ToDo: make inline function (move to header file for icl)
249 static UT // unsigned 4- or 8-byte type
250  __kmp_wait_yield(
251  volatile UT *spinner, UT checker,
252  kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
253  void *obj) // Higher-level synchronization object, or NULL.
254  ) {
255  // note: we may not belong to a team at this point
256  volatile UT *spin = spinner;
257  UT check = checker;
258  kmp_uint32 spins;
259  kmp_uint32 (*f)(UT, UT) = pred;
260  UT r;
261 
262  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
263  KMP_INIT_YIELD(spins);
264  // main wait spin loop
265  while (!f(r = *spin, check)) {
266  KMP_FSYNC_SPIN_PREPARE(obj);
267  /* GEH - remove this since it was accidentally introduced when kmp_wait was
268  split. It causes problems with infinite recursion because of exit lock */
269  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
270  __kmp_abort_thread(); */
271 
272  // if we are oversubscribed, or have waited a bit (and
273  // KMP_LIBRARY=throughput), then yield. Pause is in the following code
274  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
275  KMP_YIELD_SPIN(spins);
276  }
277  KMP_FSYNC_SPIN_ACQUIRED(obj);
278  return r;
279 }
280 
281 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
282  return value == checker;
283 }
284 
285 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
286  return value != checker;
287 }
288 
289 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
290  return value < checker;
291 }
292 
293 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
294  return value >= checker;
295 }
296 
297 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
298  return value <= checker;
299 }
300 
301 /* ------------------------------------------------------------------------ */
302 
303 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
304  ident_t *loc_ref) {
305  kmp_info_t *th;
306 
307  KMP_DEBUG_ASSERT(gtid_ref);
308 
309  if (__kmp_env_consistency_check) {
310  th = __kmp_threads[*gtid_ref];
311  if (th->th.th_root->r.r_active &&
312  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
313 #if KMP_USE_DYNAMIC_LOCK
314  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
315 #else
316  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
317 #endif
318  }
319  }
320 }
321 
322 template <typename UT>
323 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
324  typedef typename traits_t<UT>::signed_t ST;
325  dispatch_private_info_template<UT> *pr;
326 
327  int gtid = *gtid_ref;
328  // int cid = *cid_ref;
329  kmp_info_t *th = __kmp_threads[gtid];
330  KMP_DEBUG_ASSERT(th->th.th_dispatch);
331 
332  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
333  if (__kmp_env_consistency_check) {
334  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
335  th->th.th_dispatch->th_dispatch_pr_current);
336  if (pr->pushed_ws != ct_none) {
337 #if KMP_USE_DYNAMIC_LOCK
338  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
339 #else
340  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
341 #endif
342  }
343  }
344 
345  if (!th->th.th_team->t.t_serialized) {
346  dispatch_shared_info_template<UT> *sh =
347  reinterpret_cast<dispatch_shared_info_template<UT> *>(
348  th->th.th_dispatch->th_dispatch_sh_current);
349  UT lower;
350 
351  if (!__kmp_env_consistency_check) {
352  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
353  th->th.th_dispatch->th_dispatch_pr_current);
354  }
355  lower = pr->u.p.ordered_lower;
356 
357 #if !defined(KMP_GOMP_COMPAT)
358  if (__kmp_env_consistency_check) {
359  if (pr->ordered_bumped) {
360  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
361  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
362  ct_ordered_in_pdo, loc_ref,
363  &p->stack_data[p->w_top]);
364  }
365  }
366 #endif /* !defined(KMP_GOMP_COMPAT) */
367 
368  KMP_MB();
369 #ifdef KMP_DEBUG
370  {
371  char *buff;
372  // create format specifiers before the debug output
373  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
374  "ordered_iter:%%%s lower:%%%s\n",
375  traits_t<UT>::spec, traits_t<UT>::spec);
376  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
377  __kmp_str_free(&buff);
378  }
379 #endif
380 
381  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
382  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
383  KMP_MB(); /* is this necessary? */
384 #ifdef KMP_DEBUG
385  {
386  char *buff;
387  // create format specifiers before the debug output
388  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
389  "ordered_iter:%%%s lower:%%%s\n",
390  traits_t<UT>::spec, traits_t<UT>::spec);
391  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
392  __kmp_str_free(&buff);
393  }
394 #endif
395  }
396  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
397 }
398 
399 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
400  ident_t *loc_ref) {
401  kmp_info_t *th;
402 
403  if (__kmp_env_consistency_check) {
404  th = __kmp_threads[*gtid_ref];
405  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
406  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
407  }
408  }
409 }
410 
411 template <typename UT>
412 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
413  typedef typename traits_t<UT>::signed_t ST;
414  dispatch_private_info_template<UT> *pr;
415 
416  int gtid = *gtid_ref;
417  // int cid = *cid_ref;
418  kmp_info_t *th = __kmp_threads[gtid];
419  KMP_DEBUG_ASSERT(th->th.th_dispatch);
420 
421  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
422  if (__kmp_env_consistency_check) {
423  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
424  th->th.th_dispatch->th_dispatch_pr_current);
425  if (pr->pushed_ws != ct_none) {
426  __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
427  }
428  }
429 
430  if (!th->th.th_team->t.t_serialized) {
431  dispatch_shared_info_template<UT> *sh =
432  reinterpret_cast<dispatch_shared_info_template<UT> *>(
433  th->th.th_dispatch->th_dispatch_sh_current);
434 
435  if (!__kmp_env_consistency_check) {
436  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
437  th->th.th_dispatch->th_dispatch_pr_current);
438  }
439 
440  KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
441 #if !defined(KMP_GOMP_COMPAT)
442  if (__kmp_env_consistency_check) {
443  if (pr->ordered_bumped != 0) {
444  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
445  /* How to test it? - OM */
446  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
447  ct_ordered_in_pdo, loc_ref,
448  &p->stack_data[p->w_top]);
449  }
450  }
451 #endif /* !defined(KMP_GOMP_COMPAT) */
452 
453  KMP_MB(); /* Flush all pending memory write invalidates. */
454 
455  pr->ordered_bumped += 1;
456 
457  KD_TRACE(1000,
458  ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
459  gtid, pr->ordered_bumped));
460 
461  KMP_MB(); /* Flush all pending memory write invalidates. */
462 
463  /* TODO use general release procedure? */
464  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
465 
466  KMP_MB(); /* Flush all pending memory write invalidates. */
467  }
468  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
469 }
470 
471 // Computes and returns x to the power of y, where y must be a non-negative integer
472 template <typename UT>
473 static __forceinline long double __kmp_pow(long double x, UT y) {
474  long double s = 1.0L;
475 
476  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
477  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
478  while (y) {
479  if (y & 1)
480  s *= x;
481  x *= x;
482  y >>= 1;
483  }
484  return s;
485 }
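// Illustrative trace (not part of the original source; values are hypothetical),
// assuming x = 0.875 and y = 5 (binary 101). Binary exponentiation squares x
// while halving y:
//   y=5: bit set   -> s = 0.875;                       x = 0.765625
//   y=2: bit clear ->                                   x = 0.586182...
//   y=1: bit set   -> s = 0.875 * 0.586182... = 0.512909...
// i.e. 0.875^5 is computed with O(log y) multiplications instead of y - 1.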
486 
487 /* Computes and returns the number of unassigned iterations after idx chunks
488  have been assigned (the total number of unassigned iterations in chunks with
489  index greater than or equal to idx). __forceinline seems to be broken so that
490  if we __forceinline this function, the behavior is wrong
491  (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
492 template <typename T>
493 static __inline typename traits_t<T>::unsigned_t
494 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
495  typename traits_t<T>::unsigned_t idx) {
496  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
497  ICL 8.1, long double arithmetic may not really have long double precision,
498  even with /Qlong_double. Currently, we workaround that in the caller code,
499  by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
500  of precision is not expected to be a correctness issue, though. */
501  typedef typename traits_t<T>::unsigned_t UT;
502 
503  long double x = tc * __kmp_pow<UT>(base, idx);
504  UT r = (UT)x;
505  if (x == r)
506  return r;
507  return r + 1;
508 }
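// Illustrative example (hypothetical values, not from the original source):
// with tc = 1000 and base = 0.875, the iterations still unassigned after idx
// chunks are ceil(1000 * 0.875^idx):
//   idx = 0 -> 1000,  idx = 1 -> 875,  idx = 2 -> 766 (765.625 rounded up).
// The "if (x == r) return r;" test avoids adding 1 when the product is exact.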
509 
510 // Parameters of the guided-iterative algorithm:
511 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
512 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
513 // by default n = 2. For example, with n = 3 the chunk distribution will be
514 // flatter.
515 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
516 static int guided_int_param = 2;
517 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
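// Illustrative example (hypothetical values, not from the original source):
// for the guided-iterative scheme below, with nproc = 4 and chunk = 7 these
// defaults give parm2 = 2 * 4 * (7 + 1) = 64 and parm3 = 0.5 / 4 = 0.125,
// i.e. each request takes roughly 1/8 of the remaining iterations until fewer
// than 64 remain, at which point the loop switches to plain dynamic chunks.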
518 
519 // UT - unsigned flavor of T, ST - signed flavor of T,
520 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
521 template <typename T>
522 static void
523 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
524  T ub, typename traits_t<T>::signed_t st,
525  typename traits_t<T>::signed_t chunk, int push_ws) {
526  typedef typename traits_t<T>::unsigned_t UT;
527  typedef typename traits_t<T>::signed_t ST;
528  typedef typename traits_t<T>::floating_t DBL;
529 
530  int active;
531  T tc;
532  kmp_info_t *th;
533  kmp_team_t *team;
534  kmp_uint32 my_buffer_index;
535  dispatch_private_info_template<T> *pr;
536  dispatch_shared_info_template<UT> volatile *sh;
537 
538  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
539  sizeof(dispatch_private_info));
540  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
541  sizeof(dispatch_shared_info));
542 
543  if (!TCR_4(__kmp_init_parallel))
544  __kmp_parallel_initialize();
545 
546 #if INCLUDE_SSC_MARKS
547  SSC_MARK_DISPATCH_INIT();
548 #endif
549 #ifdef KMP_DEBUG
550  {
551  char *buff;
552  // create format specifiers before the debug output
553  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
554  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
555  traits_t<ST>::spec, traits_t<T>::spec,
556  traits_t<T>::spec, traits_t<ST>::spec);
557  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
558  __kmp_str_free(&buff);
559  }
560 #endif
561  /* setup data */
562  th = __kmp_threads[gtid];
563  team = th->th.th_team;
564  active = !team->t.t_serialized;
565  th->th.th_ident = loc;
566 
567 #if USE_ITT_BUILD
568  kmp_uint64 cur_chunk = chunk;
569  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
570  __kmp_forkjoin_frames_mode == 3 &&
571  KMP_MASTER_GTID(gtid) &&
572 #if OMP_40_ENABLED
573  th->th.th_teams_microtask == NULL &&
574 #endif
575  team->t.t_active_level == 1;
576 #endif
577  if (!active) {
578  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
579  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
580  } else {
581  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
582  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
583 
584  my_buffer_index = th->th.th_dispatch->th_disp_index++;
585 
586  /* What happens when the number of threads changes? Do we need to resize the buffer? */
587  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
588  &th->th.th_dispatch
589  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
590  sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
591  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
592  }
593 
594 #if (KMP_STATIC_STEAL_ENABLED)
595  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
596  // AC: we now have only one implementation of stealing, so use it
597  schedule = kmp_sch_static_steal;
598  else
599 #endif
600  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
601 
602  /* Pick up the nomerge/ordered bits from the scheduling type */
603  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
604  pr->nomerge = TRUE;
605  schedule =
606  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
607  } else {
608  pr->nomerge = FALSE;
609  }
610  pr->type_size = traits_t<T>::type_size; // remember the size of variables
611  if (kmp_ord_lower & schedule) {
612  pr->ordered = TRUE;
613  schedule =
614  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
615  } else {
616  pr->ordered = FALSE;
617  }
618 
619  if (schedule == kmp_sch_static) {
620  schedule = __kmp_static;
621  } else {
622  if (schedule == kmp_sch_runtime) {
623  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
624  // not specified)
625  schedule = team->t.t_sched.r_sched_type;
626  // Detail the schedule if needed (global controls are differentiated
627  // appropriately)
628  if (schedule == kmp_sch_guided_chunked) {
629  schedule = __kmp_guided;
630  } else if (schedule == kmp_sch_static) {
631  schedule = __kmp_static;
632  }
633  // Use the chunk size specified by OMP_SCHEDULE (or default if not
634  // specified)
635  chunk = team->t.t_sched.chunk;
636 #if USE_ITT_BUILD
637  cur_chunk = chunk;
638 #endif
639 #ifdef KMP_DEBUG
640  {
641  char *buff;
642  // create format specifiers before the debug output
643  buff = __kmp_str_format(
644  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
645  traits_t<ST>::spec);
646  KD_TRACE(10, (buff, gtid, schedule, chunk));
647  __kmp_str_free(&buff);
648  }
649 #endif
650  } else {
651  if (schedule == kmp_sch_guided_chunked) {
652  schedule = __kmp_guided;
653  }
654  if (chunk <= 0) {
655  chunk = KMP_DEFAULT_CHUNK;
656  }
657  }
658 
659  if (schedule == kmp_sch_auto) {
660  // mapping and differentiation: done in __kmp_do_serial_initialize()
661  schedule = __kmp_auto;
662 #ifdef KMP_DEBUG
663  {
664  char *buff;
665  // create format specifiers before the debug output
666  buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
667  "schedule:%%d chunk:%%%s\n",
668  traits_t<ST>::spec);
669  KD_TRACE(10, (buff, gtid, schedule, chunk));
670  __kmp_str_free(&buff);
671  }
672 #endif
673  }
674 
675  /* guided analytical not safe for too many threads */
676  if (schedule == kmp_sch_guided_analytical_chunked &&
677  th->th.th_team_nproc > 1 << 20) {
678  schedule = kmp_sch_guided_iterative_chunked;
679  KMP_WARNING(DispatchManyThreads);
680  }
681  if (schedule == kmp_sch_runtime_simd) {
682  // compiler provides simd_width in the chunk parameter
683  schedule = team->t.t_sched.r_sched_type;
684  // Detail the schedule if needed (global controls are differentiated
685  // appropriately)
686  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
687  schedule == __kmp_static) {
688  schedule = kmp_sch_static_balanced_chunked;
689  } else {
690  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
691  schedule = kmp_sch_guided_simd;
692  }
693  chunk = team->t.t_sched.chunk * chunk;
694  }
695 #if USE_ITT_BUILD
696  cur_chunk = chunk;
697 #endif
698 #ifdef KMP_DEBUG
699  {
700  char *buff;
701  // create format specifiers before the debug output
702  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
703  " chunk:%%%s\n",
704  traits_t<ST>::spec);
705  KD_TRACE(10, (buff, gtid, schedule, chunk));
706  __kmp_str_free(&buff);
707  }
708 #endif
709  }
710  pr->u.p.parm1 = chunk;
711  }
712  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
713  "unknown scheduling type");
714 
715  pr->u.p.count = 0;
716 
717  if (__kmp_env_consistency_check) {
718  if (st == 0) {
719  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
720  (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
721  }
722  }
723  // compute trip count
724  if (st == 1) { // most common case
725  if (ub >= lb) {
726  tc = ub - lb + 1;
727  } else { // ub < lb
728  tc = 0; // zero-trip
729  }
730  } else if (st < 0) {
731  if (lb >= ub) {
732  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
733  // where the division needs to be unsigned regardless of the result type
734  tc = (UT)(lb - ub) / (-st) + 1;
735  } else { // lb < ub
736  tc = 0; // zero-trip
737  }
738  } else { // st > 0
739  if (ub >= lb) {
740  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
741  // where the division needs to be unsigned regardless of the result type
742  tc = (UT)(ub - lb) / st + 1;
743  } else { // ub < lb
744  tc = 0; // zero-trip
745  }
746  }
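// Illustrative examples of the trip count formula (hypothetical values, not
// from the original source):
//   lb = 0,  ub = 9, st = 3  ->  tc = (9 - 0) / 3 + 1  = 4   (0, 3, 6, 9)
//   lb = 10, ub = 1, st = -4 ->  tc = (10 - 1) / 4 + 1 = 3   (10, 6, 2)
//   lb = 5,  ub = 4, st = 1  ->  tc = 0                      (zero-trip)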
747 
748  // Any half-decent optimizer will remove this test when the blocks are empty
749  // since the macros expand to nothing when statistics are disabled.
750  if (schedule == __kmp_static) {
751  KMP_COUNT_BLOCK(OMP_FOR_static);
752  KMP_COUNT_VALUE(FOR_static_iterations, tc);
753  } else {
754  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
755  KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
756  }
757 
758  pr->u.p.lb = lb;
759  pr->u.p.ub = ub;
760  pr->u.p.st = st;
761  pr->u.p.tc = tc;
762 
763 #if KMP_OS_WINDOWS
764  pr->u.p.last_upper = ub + st;
765 #endif /* KMP_OS_WINDOWS */
766 
767  /* NOTE: only the active parallel region(s) have active ordered sections */
768 
769  if (active) {
770  if (pr->ordered == 0) {
771  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
772  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
773  } else {
774  pr->ordered_bumped = 0;
775 
776  pr->u.p.ordered_lower = 1;
777  pr->u.p.ordered_upper = 0;
778 
779  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
780  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
781  }
782  }
783 
784  if (__kmp_env_consistency_check) {
785  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786  if (push_ws) {
787  __kmp_push_workshare(gtid, ws, loc);
788  pr->pushed_ws = ws;
789  } else {
790  __kmp_check_workshare(gtid, ws, loc);
791  pr->pushed_ws = ct_none;
792  }
793  }
794 
795  switch (schedule) {
796 #if (KMP_STATIC_STEAL_ENABLED)
797  case kmp_sch_static_steal: {
798  T nproc = th->th.th_team_nproc;
799  T ntc, init;
800 
801  KD_TRACE(100,
802  ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
803 
804  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805  if (nproc > 1 && ntc >= nproc) {
806  KMP_COUNT_BLOCK(OMP_FOR_static_steal);
807  T id = __kmp_tid_from_gtid(gtid);
808  T small_chunk, extras;
809 
810  small_chunk = ntc / nproc;
811  extras = ntc % nproc;
812 
813  init = id * small_chunk + (id < extras ? id : extras);
814  pr->u.p.count = init;
815  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
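// Illustrative example of the initial chunk distribution (hypothetical values,
// not from the original source): ntc = 10 chunks and nproc = 4 give
// small_chunk = 2, extras = 2, so the [count, ub) chunk ranges per thread are
//   id 0: [0, 3)   id 1: [3, 6)   id 2: [6, 8)   id 3: [8, 10)
// i.e. the first 'extras' threads own one extra chunk; idle threads steal later.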
816 
817  pr->u.p.parm2 = lb;
818  // pr->pfields.parm3 = 0; // it's not used in static_steal
819  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
820  pr->u.p.st = st;
821  if (traits_t<T>::type_size > 4) {
822  // AC: TODO: check if 16-byte CAS available and use it to
823  // improve performance (probably wait for explicit request
824  // before spending time on this).
825  // For now use dynamically allocated per-thread lock,
826  // free memory in __kmp_dispatch_next when status==0.
827  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
828  th->th.th_dispatch->th_steal_lock =
829  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
830  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
831  }
832  break;
833  } else {
834  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
835  "kmp_sch_static_balanced\n",
836  gtid));
837  schedule = kmp_sch_static_balanced;
838  /* too few iterations: fall-through to kmp_sch_static_balanced */
839  } // if
840  /* FALL-THROUGH to static balanced */
841  } // case
842 #endif
843  case kmp_sch_static_balanced: {
844  T nproc = th->th.th_team_nproc;
845  T init, limit;
846 
847  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
848  gtid));
849 
850  if (nproc > 1) {
851  T id = __kmp_tid_from_gtid(gtid);
852 
853  if (tc < nproc) {
854  if (id < tc) {
855  init = id;
856  limit = id;
857  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
858  } else {
859  pr->u.p.count = 1; /* means no more chunks to execute */
860  pr->u.p.parm1 = FALSE;
861  break;
862  }
863  } else {
864  T small_chunk = tc / nproc;
865  T extras = tc % nproc;
866  init = id * small_chunk + (id < extras ? id : extras);
867  limit = init + small_chunk - (id < extras ? 0 : 1);
868  pr->u.p.parm1 = (id == nproc - 1);
869  }
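// Illustrative example (hypothetical values, not from the original source):
// tc = 10 iterations and nproc = 4 give small_chunk = 2, extras = 2, so the
// [init, limit] iteration ranges computed above are
//   id 0: [0, 2]   id 1: [3, 5]   id 2: [6, 7]   id 3: [8, 9]
// and only id == nproc - 1 reports lastprivate via parm1.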
870  } else {
871  if (tc > 0) {
872  init = 0;
873  limit = tc - 1;
874  pr->u.p.parm1 = TRUE;
875  } else { // zero trip count
876  pr->u.p.count = 1; /* means no more chunks to execute */
877  pr->u.p.parm1 = FALSE;
878  break;
879  }
880  }
881 #if USE_ITT_BUILD
882  // Calculate chunk for metadata report
883  if (itt_need_metadata_reporting)
884  cur_chunk = limit - init + 1;
885 #endif
886  if (st == 1) {
887  pr->u.p.lb = lb + init;
888  pr->u.p.ub = lb + limit;
889  } else {
890  // calculated upper bound, "ub" is user-defined upper bound
891  T ub_tmp = lb + limit * st;
892  pr->u.p.lb = lb + init * st;
893  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
894  // it exactly
895  if (st > 0) {
896  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
897  } else {
898  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
899  }
900  }
901  if (pr->ordered) {
902  pr->u.p.ordered_lower = init;
903  pr->u.p.ordered_upper = limit;
904  }
905  break;
906  } // case
907  case kmp_sch_static_balanced_chunked: {
908  // similar to balanced, but chunk adjusted to multiple of simd width
909  T nth = th->th.th_team_nproc;
910  KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
911  " -> falling-through to static_greedy\n",
912  gtid));
913  schedule = kmp_sch_static_greedy;
914  if (nth > 1)
915  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
916  else
917  pr->u.p.parm1 = tc;
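// Illustrative example of the chunk rounding above (hypothetical values, not
// from the original source): tc = 100, nth = 8, chunk (simd width) = 8:
//   ceil(100 / 8) = 13, then (13 + 8 - 1) & ~(8 - 1) = 16,
// i.e. each thread's block is rounded up to a multiple of the simd width
// (the bit trick assumes chunk is a power of two).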
918  break;
919  } // case
920  case kmp_sch_guided_iterative_chunked:
921  case kmp_sch_guided_simd: {
922  T nproc = th->th.th_team_nproc;
923  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
924  " case\n",
925  gtid));
926 
927  if (nproc > 1) {
928  if ((2L * chunk + 1) * nproc >= tc) {
929  /* chunk size too large, switch to dynamic */
930  schedule = kmp_sch_dynamic_chunked;
931  } else {
932  // when remaining iters become less than parm2 - switch to dynamic
933  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
934  *(double *)&pr->u.p.parm3 =
935  guided_flt_param / nproc; // may occupy parm3 and parm4
936  }
937  } else {
938  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
939  "kmp_sch_static_greedy\n",
940  gtid));
941  schedule = kmp_sch_static_greedy;
942  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
943  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
944  gtid));
945  pr->u.p.parm1 = tc;
946  } // if
947  } // case
948  break;
949  case kmp_sch_guided_analytical_chunked: {
950  T nproc = th->th.th_team_nproc;
951  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
952  " case\n",
953  gtid));
954  if (nproc > 1) {
955  if ((2L * chunk + 1) * nproc >= tc) {
956  /* chunk size too large, switch to dynamic */
957  schedule = kmp_sch_dynamic_chunked;
958  } else {
959  /* commonly used term: (2 nproc - 1)/(2 nproc) */
960  DBL x;
961 
962 #if KMP_OS_WINDOWS && KMP_ARCH_X86
963  /* Linux* OS already has 64-bit computation by default for long double,
964  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
965  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
966  instead of the default 53-bit. Even though long double doesn't work
967  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
968  expected to impact the correctness of the algorithm, but this has not
969  been mathematically proven. */
970  // save original FPCW and set precision to 64-bit, as
971  // Windows* OS on IA-32 architecture defaults to 53-bit
972  unsigned int oldFpcw = _control87(0, 0);
973  _control87(_PC_64, _MCW_PC); // 0,0x30000
974 #endif
975  /* value used for comparison in solver for cross-over point */
976  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
977 
978  /* crossover point--chunk indexes equal to or greater than
979  this point switch to dynamic-style scheduling */
980  UT cross;
981 
982  /* commonly used term: (2 nproc - 1)/(2 nproc) */
983  x = (long double)1.0 - (long double)0.5 / nproc;
984 
985 #ifdef KMP_DEBUG
986  { // test natural alignment
987  struct _test_a {
988  char a;
989  union {
990  char b;
991  DBL d;
992  };
993  } t;
994  ptrdiff_t natural_alignment =
995  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
996  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
997  // long)natural_alignment );
998  KMP_DEBUG_ASSERT(
999  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1000  }
1001 #endif // KMP_DEBUG
1002 
1003  /* save the term in thread private dispatch structure */
1004  *(DBL *)&pr->u.p.parm3 = x;
1005 
1006  /* solve for the crossover point to the nearest integer i for which C_i
1007  <= chunk */
1008  {
1009  UT left, right, mid;
1010  long double p;
1011 
1012  /* estimate initial upper and lower bound */
1013 
1014  /* doesn't matter what value right is as long as it is positive, but
1015  it affects performance of the solver */
1016  right = 229;
1017  p = __kmp_pow<UT>(x, right);
1018  if (p > target) {
1019  do {
1020  p *= p;
1021  right <<= 1;
1022  } while (p > target && right < (1 << 27));
1023  /* lower bound is previous (failed) estimate of upper bound */
1024  left = right >> 1;
1025  } else {
1026  left = 0;
1027  }
1028 
1029  /* bisection root-finding method */
1030  while (left + 1 < right) {
1031  mid = (left + right) / 2;
1032  if (__kmp_pow<UT>(x, mid) > target) {
1033  left = mid;
1034  } else {
1035  right = mid;
1036  }
1037  } // while
1038  cross = right;
1039  }
1040  /* assert sanity of computed crossover point */
1041  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1042  __kmp_pow<UT>(x, cross) <= target);
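// Illustrative example of the crossover search (hypothetical values, not from
// the original source): nproc = 4, chunk = 10, tc = 1000 give
//   x = 1 - 1/(2*4) = 0.875 and target = (2*10 + 1) * 4 / 1000 = 0.084,
// and the bisection finds cross = 19, since 0.875^18 ~= 0.090 > 0.084 while
// 0.875^19 ~= 0.079 <= 0.084. Chunk indexes >= 19 then use dynamic-style
// scheduling with the minimum chunk size.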
1043 
1044  /* save the crossover point in thread private dispatch structure */
1045  pr->u.p.parm2 = cross;
1046 
1047 // C75803
1048 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1049 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1050 #else
1051 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1052 #endif
1053  /* dynamic-style scheduling offset */
1054  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1055  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1056  cross * chunk;
1057 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1058  // restore FPCW
1059  _control87(oldFpcw, _MCW_PC);
1060 #endif
1061  } // if
1062  } else {
1063  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1064  "kmp_sch_static_greedy\n",
1065  gtid));
1066  schedule = kmp_sch_static_greedy;
1067  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1068  pr->u.p.parm1 = tc;
1069  } // if
1070  } // case
1071  break;
1072  case kmp_sch_static_greedy:
1073  KD_TRACE(100,
1074  ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1075  pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1076  ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1077  : tc;
1078  break;
1079  case kmp_sch_static_chunked:
1080  case kmp_sch_dynamic_chunked:
1081  if (pr->u.p.parm1 <= 0) {
1082  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1083  }
1084  KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1085  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1086  gtid));
1087  break;
1088  case kmp_sch_trapezoidal: {
1089  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1090 
1091  T parm1, parm2, parm3, parm4;
1092  KD_TRACE(100,
1093  ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1094 
1095  parm1 = chunk;
1096 
1097  /* F : size of the first cycle */
1098  parm2 = (tc / (2 * th->th.th_team_nproc));
1099 
1100  if (parm2 < 1) {
1101  parm2 = 1;
1102  }
1103 
1104  /* L : size of the last cycle. Make sure the last cycle is not larger
1105  than the first cycle. */
1106  if (parm1 < 1) {
1107  parm1 = 1;
1108  } else if (parm1 > parm2) {
1109  parm1 = parm2;
1110  }
1111 
1112  /* N : number of cycles */
1113  parm3 = (parm2 + parm1);
1114  parm3 = (2 * tc + parm3 - 1) / parm3;
1115 
1116  if (parm3 < 2) {
1117  parm3 = 2;
1118  }
1119 
1120  /* sigma : decreasing incr of the trapezoid */
1121  parm4 = (parm3 - 1);
1122  parm4 = (parm2 - parm1) / parm4;
1123 
1124  // pointless check, because parm4 >= 0 always
1125  // if ( parm4 < 0 ) {
1126  // parm4 = 0;
1127  //}
1128 
1129  pr->u.p.parm1 = parm1;
1130  pr->u.p.parm2 = parm2;
1131  pr->u.p.parm3 = parm3;
1132  pr->u.p.parm4 = parm4;
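// Illustrative example of the TSS parameters (hypothetical values, not from
// the original source): tc = 1000, nproc = 4, chunk = 10 give
//   parm2 (first chunk)  = 1000 / (2 * 4)        = 125
//   parm1 (min chunk)    = 10
//   parm3 (chunk count)  = ceil(2 * 1000 / 135)  = 15
//   parm4 (decrement)    = (125 - 10) / (15 - 1) = 8
// so successive chunks are roughly 125, 117, 109, ... decreasing toward 10.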
1133  } // case
1134  break;
1135 
1136  default: {
1137  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1138  KMP_HNT(GetNewerLibrary), // Hint
1139  __kmp_msg_null // Variadic argument list terminator
1140  );
1141  } break;
1142  } // switch
1143  pr->schedule = schedule;
1144  if (active) {
1145  /* This buffer is free to use once sh->buffer_index equals my_buffer_index;
1146  * the wait below enforces that */
1147 
1148  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1149  "sh->buffer_index:%d\n",
1150  gtid, my_buffer_index, sh->buffer_index));
1151  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1152  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1153  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1154  // my_buffer_index are *always* 32-bit integers.
1155  KMP_MB(); /* is this necessary? */
1156  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1157  "sh->buffer_index:%d\n",
1158  gtid, my_buffer_index, sh->buffer_index));
1159 
1160  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1161  th->th.th_dispatch->th_dispatch_sh_current =
1162  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1163 #if USE_ITT_BUILD
1164  if (pr->ordered) {
1165  __kmp_itt_ordered_init(gtid);
1166  }
1167  // Report loop metadata
1168  if (itt_need_metadata_reporting) {
1169  // Only report metadata by master of active team at level 1
1170  kmp_uint64 schedtype = 0;
1171  switch (schedule) {
1172  case kmp_sch_static_chunked:
1173  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1174  break;
1175  case kmp_sch_static_greedy:
1176  cur_chunk = pr->u.p.parm1;
1177  break;
1178  case kmp_sch_dynamic_chunked:
1179  schedtype = 1;
1180  break;
1181  case kmp_sch_guided_iterative_chunked:
1182  case kmp_sch_guided_analytical_chunked:
1183  case kmp_sch_guided_simd:
1184  schedtype = 2;
1185  break;
1186  default:
1187  // Should we put this case under "static"?
1188  // case kmp_sch_static_steal:
1189  schedtype = 3;
1190  break;
1191  }
1192  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1193  }
1194 #endif /* USE_ITT_BUILD */
1195  }
1196 
1197 #ifdef KMP_DEBUG
1198  {
1199  char *buff;
1200  // create format specifiers before the debug output
1201  buff = __kmp_str_format(
1202  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1203  "lb:%%%s ub:%%%s"
1204  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1205  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1206  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1207  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1208  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1209  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1210  KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1211  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1212  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1213  pr->u.p.parm3, pr->u.p.parm4));
1214  __kmp_str_free(&buff);
1215  }
1216 #endif
1217 #if (KMP_STATIC_STEAL_ENABLED)
1218  // It cannot be guaranteed that after execution of a loop with some other
1219  // schedule kind all the parm3 variables will contain the same value. Even if
1220  // all parm3 values were the same, a bad case would still exist, such as using
1221  // 0 and 1 rather than a program-lifetime increment. So a dedicated variable is
1222  // required; the 'static_steal_counter' is used.
1223  if (schedule == kmp_sch_static_steal) {
1224  // Other threads will inspect this variable when searching for a victim.
1225  // This is a flag showing that, from this point on, other threads may steal
1226  // from this thread.
1227  volatile T *p = &pr->u.p.static_steal_counter;
1228  *p = *p + 1;
1229  }
1230 #endif // ( KMP_STATIC_STEAL_ENABLED )
1231 
1232 #if OMPT_SUPPORT && OMPT_OPTIONAL
1233  if (ompt_enabled.ompt_callback_work) {
1234  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1235  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1236  kmp_info_t *thr = __kmp_threads[gtid];
1237  ompt_callbacks.ompt_callback(ompt_callback_work)(
1238  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1239  &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1240  }
1241 #endif
1242 }
1243 
1244 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1245  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1246  * every chunk of iterations. If the ordered section(s) were not executed
1247  * for this iteration (or every iteration in this chunk), we need to set the
1248  * ordered iteration counters so that the next thread can proceed. */
1249 template <typename UT>
1250 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1251  typedef typename traits_t<UT>::signed_t ST;
1252  kmp_info_t *th = __kmp_threads[gtid];
1253 
1254  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1255  if (!th->th.th_team->t.t_serialized) {
1256 
1257  dispatch_private_info_template<UT> *pr =
1258  reinterpret_cast<dispatch_private_info_template<UT> *>(
1259  th->th.th_dispatch->th_dispatch_pr_current);
1260  dispatch_shared_info_template<UT> volatile *sh =
1261  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1262  th->th.th_dispatch->th_dispatch_sh_current);
1263  KMP_DEBUG_ASSERT(pr);
1264  KMP_DEBUG_ASSERT(sh);
1265  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1266  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1267 
1268  if (pr->ordered_bumped) {
1269  KD_TRACE(
1270  1000,
1271  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1272  gtid));
1273  pr->ordered_bumped = 0;
1274  } else {
1275  UT lower = pr->u.p.ordered_lower;
1276 
1277 #ifdef KMP_DEBUG
1278  {
1279  char *buff;
1280  // create format specifiers before the debug output
1281  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1282  "ordered_iteration:%%%s lower:%%%s\n",
1283  traits_t<UT>::spec, traits_t<UT>::spec);
1284  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1285  __kmp_str_free(&buff);
1286  }
1287 #endif
1288 
1289  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1290  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291  KMP_MB(); /* is this necessary? */
1292 #ifdef KMP_DEBUG
1293  {
1294  char *buff;
1295  // create format specifiers before the debug output
1296  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1297  "ordered_iteration:%%%s lower:%%%s\n",
1298  traits_t<UT>::spec, traits_t<UT>::spec);
1299  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1300  __kmp_str_free(&buff);
1301  }
1302 #endif
1303 
1304  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1305  } // if
1306  } // if
1307  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1308 }
1309 
1310 #ifdef KMP_GOMP_COMPAT
1311 
1312 template <typename UT>
1313 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1314  typedef typename traits_t<UT>::signed_t ST;
1315  kmp_info_t *th = __kmp_threads[gtid];
1316 
1317  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1318  if (!th->th.th_team->t.t_serialized) {
1319  // int cid;
1320  dispatch_private_info_template<UT> *pr =
1321  reinterpret_cast<dispatch_private_info_template<UT> *>(
1322  th->th.th_dispatch->th_dispatch_pr_current);
1323  dispatch_shared_info_template<UT> volatile *sh =
1324  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1325  th->th.th_dispatch->th_dispatch_sh_current);
1326  KMP_DEBUG_ASSERT(pr);
1327  KMP_DEBUG_ASSERT(sh);
1328  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1329  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1330 
1331  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1332  UT lower = pr->u.p.ordered_lower;
1333  UT upper = pr->u.p.ordered_upper;
1334  UT inc = upper - lower + 1;
1335 
1336  if (pr->ordered_bumped == inc) {
1337  KD_TRACE(
1338  1000,
1339  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1340  gtid));
1341  pr->ordered_bumped = 0;
1342  } else {
1343  inc -= pr->ordered_bumped;
1344 
1345 #ifdef KMP_DEBUG
1346  {
1347  char *buff;
1348  // create format specifiers before the debug output
1349  buff = __kmp_str_format(
1350  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1351  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1352  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1353  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1354  __kmp_str_free(&buff);
1355  }
1356 #endif
1357 
1358  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1359  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1360 
1361  KMP_MB(); /* is this necessary? */
1362  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1363  "ordered_bumped to zero\n",
1364  gtid));
1365  pr->ordered_bumped = 0;
1366
1367 #ifdef KMP_DEBUG
1368  {
1369  char *buff;
1370  // create format specifiers before the debug output
1371  buff = __kmp_str_format(
1372  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1373  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1374  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1375  traits_t<UT>::spec);
1376  KD_TRACE(1000,
1377  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1378  __kmp_str_free(&buff);
1379  }
1380 #endif
1381 
1382  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1383  }
1384  // }
1385  }
1386  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1387 }
1388 
1389 #endif /* KMP_GOMP_COMPAT */
1390 
1391 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1392  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1393  is not called. */
1394 #if OMPT_SUPPORT && OMPT_OPTIONAL
1395 #define OMPT_LOOP_END \
1396  if (status == 0) { \
1397  if (ompt_enabled.ompt_callback_work) { \
1398  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1399  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1400  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1401  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1402  &(task_info->task_data), 0, codeptr); \
1403  } \
1404  }
1405 // TODO: implement count
1406 #else
1407 #define OMPT_LOOP_END // no-op
1408 #endif
1409 
1410 template <typename T>
1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1412  T *p_lb, T *p_ub,
1413  typename traits_t<T>::signed_t *p_st
1414 #if OMPT_SUPPORT && OMPT_OPTIONAL
1415  ,
1416  void *codeptr
1417 #endif
1418  ) {
1419 
1420  typedef typename traits_t<T>::unsigned_t UT;
1421  typedef typename traits_t<T>::signed_t ST;
1422  typedef typename traits_t<T>::floating_t DBL;
1423 
1424  // This is potentially slightly misleading: schedule(runtime) will appear here
1425  // even if the actual runtime schedule is static. (Which points out a
1426  // disadvantage of schedule(runtime): even when static scheduling is used, it
1427  // costs more than a compile-time choice to use static scheduling would.)
1428  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1429 
1430  int status;
1431  dispatch_private_info_template<T> *pr;
1432  kmp_info_t *th = __kmp_threads[gtid];
1433  kmp_team_t *team = th->th.th_team;
1434 
1435  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1436 #ifdef KMP_DEBUG
1437  {
1438  char *buff;
1439  // create format specifiers before the debug output
1440  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1441  "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1442  traits_t<T>::spec, traits_t<T>::spec,
1443  traits_t<ST>::spec);
1444  KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1445  __kmp_str_free(&buff);
1446  }
1447 #endif
1448 
1449  if (team->t.t_serialized) {
1450  /* NOTE: serialize this dispatch because we are not at the active level */
1451  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1452  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1453  KMP_DEBUG_ASSERT(pr);
1454 
1455  if ((status = (pr->u.p.tc != 0)) == 0) {
1456  *p_lb = 0;
1457  *p_ub = 0;
1458  // if ( p_last != NULL )
1459  // *p_last = 0;
1460  if (p_st != NULL)
1461  *p_st = 0;
1462  if (__kmp_env_consistency_check) {
1463  if (pr->pushed_ws != ct_none) {
1464  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1465  }
1466  }
1467  } else if (pr->nomerge) {
1468  kmp_int32 last;
1469  T start;
1470  UT limit, trip, init;
1471  ST incr;
1472  T chunk = pr->u.p.parm1;
1473 
1474  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1475  gtid));
1476 
1477  init = chunk * pr->u.p.count++;
1478  trip = pr->u.p.tc - 1;
1479 
1480  if ((status = (init <= trip)) == 0) {
1481  *p_lb = 0;
1482  *p_ub = 0;
1483  // if ( p_last != NULL )
1484  // *p_last = 0;
1485  if (p_st != NULL)
1486  *p_st = 0;
1487  if (__kmp_env_consistency_check) {
1488  if (pr->pushed_ws != ct_none) {
1489  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1490  }
1491  }
1492  } else {
1493  start = pr->u.p.lb;
1494  limit = chunk + init - 1;
1495  incr = pr->u.p.st;
1496 
1497  if ((last = (limit >= trip)) != 0) {
1498  limit = trip;
1499 #if KMP_OS_WINDOWS
1500  pr->u.p.last_upper = pr->u.p.ub;
1501 #endif /* KMP_OS_WINDOWS */
1502  }
1503  if (p_last != NULL)
1504  *p_last = last;
1505  if (p_st != NULL)
1506  *p_st = incr;
1507  if (incr == 1) {
1508  *p_lb = start + init;
1509  *p_ub = start + limit;
1510  } else {
1511  *p_lb = start + init * incr;
1512  *p_ub = start + limit * incr;
1513  }
1514 
1515  if (pr->ordered) {
1516  pr->u.p.ordered_lower = init;
1517  pr->u.p.ordered_upper = limit;
1518 #ifdef KMP_DEBUG
1519  {
1520  char *buff;
1521  // create format specifiers before the debug output
1522  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1523  "ordered_lower:%%%s ordered_upper:%%%s\n",
1524  traits_t<UT>::spec, traits_t<UT>::spec);
1525  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1526  pr->u.p.ordered_upper));
1527  __kmp_str_free(&buff);
1528  }
1529 #endif
1530  } // if
1531  } // if
1532  } else {
1533  pr->u.p.tc = 0;
1534  *p_lb = pr->u.p.lb;
1535  *p_ub = pr->u.p.ub;
1536 #if KMP_OS_WINDOWS
1537  pr->u.p.last_upper = *p_ub;
1538 #endif /* KMP_OS_WINDOWS */
1539  if (p_last != NULL)
1540  *p_last = TRUE;
1541  if (p_st != NULL)
1542  *p_st = pr->u.p.st;
1543  } // if
1544 #ifdef KMP_DEBUG
1545  {
1546  char *buff;
1547  // create format specifiers before the debug output
1548  buff = __kmp_str_format(
1549  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1550  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1551  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1552  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1553  __kmp_str_free(&buff);
1554  }
1555 #endif
1556 #if INCLUDE_SSC_MARKS
1557  SSC_MARK_DISPATCH_NEXT();
1558 #endif
1559  OMPT_LOOP_END;
1560  return status;
1561  } else {
1562  kmp_int32 last = 0;
1563  dispatch_shared_info_template<UT> *sh;
1564  T start;
1565  ST incr;
1566  UT limit, trip, init;
1567 
1568  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1569  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1570 
1571  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1572  th->th.th_dispatch->th_dispatch_pr_current);
1573  KMP_DEBUG_ASSERT(pr);
1574  sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1575  th->th.th_dispatch->th_dispatch_sh_current);
1576  KMP_DEBUG_ASSERT(sh);
1577 
1578  if (pr->u.p.tc == 0) {
1579  // zero trip count
1580  status = 0;
1581  } else {
1582  switch (pr->schedule) {
1583 #if (KMP_STATIC_STEAL_ENABLED)
1584  case kmp_sch_static_steal: {
1585  T chunk = pr->u.p.parm1;
1586  int nproc = th->th.th_team_nproc;
1587 
1588  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1589  gtid));
1590 
1591  trip = pr->u.p.tc - 1;
1592 
1593  if (traits_t<T>::type_size > 4) {
1594  // use lock for 8-byte and CAS for 4-byte induction
1595  // variable. TODO (optional): check and use 16-byte CAS
1596  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1597  KMP_DEBUG_ASSERT(lck != NULL);
1598  if (pr->u.p.count < (UT)pr->u.p.ub) {
1599  __kmp_acquire_lock(lck, gtid);
1600  // try to get own chunk of iterations
1601  init = (pr->u.p.count)++;
1602  status = (init < (UT)pr->u.p.ub);
1603  __kmp_release_lock(lck, gtid);
1604  } else {
1605  status = 0; // no own chunks
1606  }
1607  if (!status) { // try to steal
1608  kmp_info_t **other_threads = team->t.t_threads;
1609  int while_limit = nproc; // nproc attempts to find a victim
1610  int while_index = 0;
1611  // TODO: algorithm of searching for a victim
1612  // should be cleaned up and measured
1613  while ((!status) && (while_limit != ++while_index)) {
1614  T remaining;
1615  T victimIdx = pr->u.p.parm4;
1616  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1617  dispatch_private_info_template<T> *victim =
1618  reinterpret_cast<dispatch_private_info_template<T> *>(
1619  other_threads[victimIdx]
1620  ->th.th_dispatch->th_dispatch_pr_current);
1621  while ((victim == NULL || victim == pr ||
1622  (*(volatile T *)&victim->u.p.static_steal_counter !=
1623  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1624  oldVictimIdx != victimIdx) {
1625  victimIdx = (victimIdx + 1) % nproc;
1626  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1627  other_threads[victimIdx]
1628  ->th.th_dispatch->th_dispatch_pr_current);
1629  }
1630  if (!victim ||
1631  (*(volatile T *)&victim->u.p.static_steal_counter !=
1632  *(volatile T *)&pr->u.p.static_steal_counter)) {
1633  continue; // try once more (nproc attempts in total)
1634  // no victim is ready yet to participate in stealing
1635  // because all victims are still in kmp_init_dispatch
1636  }
1637  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1638  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1639  continue; // not enough chunks to steal, goto next victim
1640  }
1641 
1642  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1643  KMP_ASSERT(lck != NULL);
1644  __kmp_acquire_lock(lck, gtid);
1645  limit = victim->u.p.ub; // keep initial ub
1646  if (victim->u.p.count >= limit ||
1647  (remaining = limit - victim->u.p.count) < 2) {
1648  __kmp_release_lock(lck, gtid);
1649  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1650  continue; // not enough chunks to steal
1651  }
1652  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1653  // or by 1
1654  if (remaining > 3) {
1655  KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1656  init = (victim->u.p.ub -=
1657  (remaining >> 2)); // steal 1/4 of remaining
1658  } else {
1659  KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1660  init =
1661  (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1662  }
1663  __kmp_release_lock(lck, gtid);
1664 
1665  KMP_DEBUG_ASSERT(init + 1 <= limit);
1666  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1667  status = 1;
1668  while_index = 0;
 1669  // now update own count and ub with the stolen range, excluding the init chunk just claimed
1670  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1671  pr->u.p.count = init + 1;
1672  pr->u.p.ub = limit;
1673  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1674  } // while (search for victim)
1675  } // if (try to find victim and steal)
1676  } else {
1677  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1678  typedef union {
1679  struct {
1680  UT count;
1681  T ub;
1682  } p;
1683  kmp_int64 b;
1684  } union_i4;
1685  // All operations on 'count' or 'ub' must be combined atomically
1686  // together.
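  // A torn update could let two threads observe the same 'count' or a stale
  // 'ub' after a steal and hand out the same chunk twice.  Packing the pair
  // into one 64-bit word and advancing 'count' with a single compare-and-swap
  // (retried on failure) keeps the chunk claim and the bound check consistent.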
1687  {
1688  union_i4 vold, vnew;
1689  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1690  vnew = vold;
1691  vnew.p.count++;
1692  while (!KMP_COMPARE_AND_STORE_ACQ64(
1693  (volatile kmp_int64 *)&pr->u.p.count,
1694  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1695  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1696  KMP_CPU_PAUSE();
1697  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1698  vnew = vold;
1699  vnew.p.count++;
1700  }
1701  vnew = vold;
1702  init = vnew.p.count;
1703  status = (init < (UT)vnew.p.ub);
1704  }
1705 
1706  if (!status) {
1707  kmp_info_t **other_threads = team->t.t_threads;
1708  int while_limit = nproc; // nproc attempts to find a victim
1709  int while_index = 0;
1710 
1711  // TODO: algorithm of searching for a victim
1712  // should be cleaned up and measured
1713  while ((!status) && (while_limit != ++while_index)) {
1714  union_i4 vold, vnew;
1715  kmp_int32 remaining;
1716  T victimIdx = pr->u.p.parm4;
1717  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1718  dispatch_private_info_template<T> *victim =
1719  reinterpret_cast<dispatch_private_info_template<T> *>(
1720  other_threads[victimIdx]
1721  ->th.th_dispatch->th_dispatch_pr_current);
1722  while ((victim == NULL || victim == pr ||
1723  (*(volatile T *)&victim->u.p.static_steal_counter !=
1724  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1725  oldVictimIdx != victimIdx) {
1726  victimIdx = (victimIdx + 1) % nproc;
1727  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1728  other_threads[victimIdx]
1729  ->th.th_dispatch->th_dispatch_pr_current);
1730  }
1731  if (!victim ||
1732  (*(volatile T *)&victim->u.p.static_steal_counter !=
1733  *(volatile T *)&pr->u.p.static_steal_counter)) {
1734  continue; // try once more (nproc attempts in total)
1735  // no victim is ready yet to participate in stealing
1736  // because all victims are still in kmp_init_dispatch
1737  }
1738  pr->u.p.parm4 = victimIdx; // new victim found
1739  while (1) { // CAS loop if victim has enough chunks to steal
1740  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1741  vnew = vold;
1742 
1743  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1744  if (vnew.p.count >= (UT)vnew.p.ub ||
1745  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1746  pr->u.p.parm4 =
1747  (victimIdx + 1) % nproc; // shift start victim id
1748  break; // not enough chunks to steal, goto next victim
1749  }
1750  if (remaining > 3) {
1751  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1752  } else {
1753  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1754  }
1755  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1756  // TODO: Should this be acquire or release?
1757  if (KMP_COMPARE_AND_STORE_ACQ64(
1758  (volatile kmp_int64 *)&victim->u.p.count,
1759  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1760  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1761  // stealing succeeded
1762  KMP_COUNT_VALUE(FOR_static_steal_stolen,
1763  vold.p.ub - vnew.p.ub);
1764  status = 1;
1765  while_index = 0;
1766  // now update own count and ub
1767  init = vnew.p.ub;
1768  vold.p.count = init + 1;
1769 #if KMP_ARCH_X86
1770  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1771  vold.b);
1772 #else
1773  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1774 #endif
1775  break;
1776  } // if (check CAS result)
1777  KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1778  } // while (try to steal from particular victim)
1779  } // while (search for victim)
1780  } // if (try to find victim and steal)
1781  } // if (4-byte induction variable)
1782  if (!status) {
1783  *p_lb = 0;
1784  *p_ub = 0;
1785  if (p_st != NULL)
1786  *p_st = 0;
1787  } else {
1788  start = pr->u.p.parm2;
1789  init *= chunk;
1790  limit = chunk + init - 1;
1791  incr = pr->u.p.st;
1792  KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1793 
1794  KMP_DEBUG_ASSERT(init <= trip);
1795  if ((last = (limit >= trip)) != 0)
1796  limit = trip;
1797  if (p_st != NULL)
1798  *p_st = incr;
1799 
1800  if (incr == 1) {
1801  *p_lb = start + init;
1802  *p_ub = start + limit;
1803  } else {
1804  *p_lb = start + init * incr;
1805  *p_ub = start + limit * incr;
1806  }
1807 
1808  if (pr->ordered) {
1809  pr->u.p.ordered_lower = init;
1810  pr->u.p.ordered_upper = limit;
1811 #ifdef KMP_DEBUG
1812  {
1813  char *buff;
1814  // create format specifiers before the debug output
1815  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1816  "ordered_lower:%%%s ordered_upper:%%%s\n",
1817  traits_t<UT>::spec, traits_t<UT>::spec);
1818  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1819  pr->u.p.ordered_upper));
1820  __kmp_str_free(&buff);
1821  }
1822 #endif
1823  } // if
1824  } // if
1825  break;
1826  } // case
1827 #endif // ( KMP_STATIC_STEAL_ENABLED )
1828  case kmp_sch_static_balanced: {
1829  KD_TRACE(
1830  100,
1831  ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1832  if ((status = !pr->u.p.count) !=
1833  0) { /* check if thread has any iteration to do */
1834  pr->u.p.count = 1;
1835  *p_lb = pr->u.p.lb;
1836  *p_ub = pr->u.p.ub;
1837  last = pr->u.p.parm1;
1838  if (p_st != NULL)
1839  *p_st = pr->u.p.st;
1840  } else { /* no iterations to do */
1841  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1842  }
1843  if (pr->ordered) {
1844 #ifdef KMP_DEBUG
1845  {
1846  char *buff;
1847  // create format specifiers before the debug output
1848  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1849  "ordered_lower:%%%s ordered_upper:%%%s\n",
1850  traits_t<UT>::spec, traits_t<UT>::spec);
1851  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1852  pr->u.p.ordered_upper));
1853  __kmp_str_free(&buff);
1854  }
1855 #endif
1856  } // if
1857  } // case
1858  break;
1859  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1860  merged here */
1861  case kmp_sch_static_chunked: {
1862  T parm1;
1863 
1864  KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
 1865  "kmp_sch_static_[greedy|chunked] case\n",
1866  gtid));
1867  parm1 = pr->u.p.parm1;
1868 
1869  trip = pr->u.p.tc - 1;
1870  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
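  // Chunks are dealt round-robin: this thread takes chunk (count + tid),
  // i.e. chunks tid, tid + nproc, tid + 2*nproc, ... each of size parm1;
  // 'count' is advanced by nproc below once the chunk is claimed.  For
  // kmp_sch_static_greedy, parm1 is expected to have been sized in
  // __kmp_dispatch_init so that a single chunk covers the thread's whole
  // share of the iteration space.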
1871 
1872  if ((status = (init <= trip)) != 0) {
1873  start = pr->u.p.lb;
1874  incr = pr->u.p.st;
1875  limit = parm1 + init - 1;
1876 
1877  if ((last = (limit >= trip)) != 0)
1878  limit = trip;
1879 
1880  if (p_st != NULL)
1881  *p_st = incr;
1882 
1883  pr->u.p.count += th->th.th_team_nproc;
1884 
1885  if (incr == 1) {
1886  *p_lb = start + init;
1887  *p_ub = start + limit;
1888  } else {
1889  *p_lb = start + init * incr;
1890  *p_ub = start + limit * incr;
1891  }
1892 
1893  if (pr->ordered) {
1894  pr->u.p.ordered_lower = init;
1895  pr->u.p.ordered_upper = limit;
1896 #ifdef KMP_DEBUG
1897  {
1898  char *buff;
1899  // create format specifiers before the debug output
1900  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1901  "ordered_lower:%%%s ordered_upper:%%%s\n",
1902  traits_t<UT>::spec, traits_t<UT>::spec);
1903  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1904  pr->u.p.ordered_upper));
1905  __kmp_str_free(&buff);
1906  }
1907 #endif
1908  } // if
1909  } // if
1910  } // case
1911  break;
1912 
1913  case kmp_sch_dynamic_chunked: {
1914  T chunk = pr->u.p.parm1;
1915 
1916  KD_TRACE(
1917  100,
1918  ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1919 
1920  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
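  // Each atomic increment of the shared iteration counter claims the next
  // chunk: the n-th increment yields iterations [n*chunk, n*chunk + chunk - 1]
  // (clipped to the trip count below).  E.g. with chunk == 4, successive
  // dispatches return 0-3, 4-7, 8-11, ...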
1921  trip = pr->u.p.tc - 1;
1922 
1923  if ((status = (init <= trip)) == 0) {
1924  *p_lb = 0;
1925  *p_ub = 0;
1926  if (p_st != NULL)
1927  *p_st = 0;
1928  } else {
1929  start = pr->u.p.lb;
1930  limit = chunk + init - 1;
1931  incr = pr->u.p.st;
1932 
1933  if ((last = (limit >= trip)) != 0)
1934  limit = trip;
1935 
1936  if (p_st != NULL)
1937  *p_st = incr;
1938 
1939  if (incr == 1) {
1940  *p_lb = start + init;
1941  *p_ub = start + limit;
1942  } else {
1943  *p_lb = start + init * incr;
1944  *p_ub = start + limit * incr;
1945  }
1946 
1947  if (pr->ordered) {
1948  pr->u.p.ordered_lower = init;
1949  pr->u.p.ordered_upper = limit;
1950 #ifdef KMP_DEBUG
1951  {
1952  char *buff;
1953  // create format specifiers before the debug output
1954  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1955  "ordered_lower:%%%s ordered_upper:%%%s\n",
1956  traits_t<UT>::spec, traits_t<UT>::spec);
1957  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1958  pr->u.p.ordered_upper));
1959  __kmp_str_free(&buff);
1960  }
1961 #endif
1962  } // if
1963  } // if
1964  } // case
1965  break;
1966 
1967  case kmp_sch_guided_iterative_chunked: {
1968  T chunkspec = pr->u.p.parm1;
1969  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1970  "iterative case\n",
1971  gtid));
1972  trip = pr->u.p.tc;
1973  // Start atomic part of calculations
1974  while (1) {
1975  ST remaining; // signed, because can be < 0
1976  init = sh->u.s.iteration; // shared value
1977  remaining = trip - init;
1978  if (remaining <= 0) { // AC: need to compare with 0 first
1979  // nothing to do, don't try atomic op
1980  status = 0;
1981  break;
1982  }
1983  if ((T)remaining <
1984  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
 1985  // use dynamic-style schedule
 1986  // atomically increment iterations, get old value
1987  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1988  (ST)chunkspec);
1989  remaining = trip - init;
1990  if (remaining <= 0) {
1991  status = 0; // all iterations got by other threads
1992  } else { // got some iterations to work on
1993  status = 1;
1994  if ((T)remaining > chunkspec) {
1995  limit = init + chunkspec - 1;
1996  } else {
1997  last = 1; // the last chunk
1998  limit = init + remaining - 1;
1999  } // if
2000  } // if
2001  break;
2002  } // if
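  // Guided step: take a fraction of the remaining iterations.  parm3 is
  // expected to hold 1/(K*nproc) as a double (K = 2 by default, cf. the
  // parm2 comparison above), so the chunk computed below is roughly
  // remaining/(K*nproc); the CAS publishes the new shared iteration start.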
2003  limit = init + (UT)(remaining *
2004  *(double *)&pr->u.p.parm3); // divide by K*nproc
2005  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2006  (ST)init, (ST)limit)) {
2007  // CAS was successful, chunk obtained
2008  status = 1;
2009  --limit;
2010  break;
2011  } // if
2012  } // while
2013  if (status != 0) {
2014  start = pr->u.p.lb;
2015  incr = pr->u.p.st;
2016  if (p_st != NULL)
2017  *p_st = incr;
2018  *p_lb = start + init * incr;
2019  *p_ub = start + limit * incr;
2020  if (pr->ordered) {
2021  pr->u.p.ordered_lower = init;
2022  pr->u.p.ordered_upper = limit;
2023 #ifdef KMP_DEBUG
2024  {
2025  char *buff;
2026  // create format specifiers before the debug output
2027  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2028  "ordered_lower:%%%s ordered_upper:%%%s\n",
2029  traits_t<UT>::spec, traits_t<UT>::spec);
2030  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2031  pr->u.p.ordered_upper));
2032  __kmp_str_free(&buff);
2033  }
2034 #endif
2035  } // if
2036  } else {
2037  *p_lb = 0;
2038  *p_ub = 0;
2039  if (p_st != NULL)
2040  *p_st = 0;
2041  } // if
2042  } // case
2043  break;
2044 
2045  case kmp_sch_guided_simd: {
2046  // same as iterative but curr-chunk adjusted to be multiple of given
2047  // chunk
2048  T chunk = pr->u.p.parm1;
2049  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2050  gtid));
2051  trip = pr->u.p.tc;
2052  // Start atomic part of calculations
2053  while (1) {
2054  ST remaining; // signed, because can be < 0
2055  init = sh->u.s.iteration; // shared value
2056  remaining = trip - init;
2057  if (remaining <= 0) { // AC: need to compare with 0 first
2058  status = 0; // nothing to do, don't try atomic op
2059  break;
2060  }
2061  KMP_DEBUG_ASSERT(init % chunk == 0);
2062  // compare with K*nproc*(chunk+1), K=2 by default
2063  if ((T)remaining < pr->u.p.parm2) {
 2064  // use dynamic-style schedule
 2065  // atomically increment iterations, get old value
2066  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2067  (ST)chunk);
2068  remaining = trip - init;
2069  if (remaining <= 0) {
2070  status = 0; // all iterations got by other threads
2071  } else {
2072  // got some iterations to work on
2073  status = 1;
2074  if ((T)remaining > chunk) {
2075  limit = init + chunk - 1;
2076  } else {
2077  last = 1; // the last chunk
2078  limit = init + remaining - 1;
2079  } // if
2080  } // if
2081  break;
2082  } // if
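  // Same guided step as above, but the span is rounded up to a multiple of
  // 'chunk' so every dispatched block stays aligned to the SIMD chunk.
  // E.g. with chunk == 4 and a computed span of 10: rem == 2, span becomes 12.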
2083  // divide by K*nproc
2084  UT span = remaining * (*(double *)&pr->u.p.parm3);
2085  UT rem = span % chunk;
2086  if (rem) // adjust so that span%chunk == 0
2087  span += chunk - rem;
2088  limit = init + span;
2089  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2090  (ST)init, (ST)limit)) {
2091  // CAS was successful, chunk obtained
2092  status = 1;
2093  --limit;
2094  break;
2095  } // if
2096  } // while
2097  if (status != 0) {
2098  start = pr->u.p.lb;
2099  incr = pr->u.p.st;
2100  if (p_st != NULL)
2101  *p_st = incr;
2102  *p_lb = start + init * incr;
2103  *p_ub = start + limit * incr;
2104  if (pr->ordered) {
2105  pr->u.p.ordered_lower = init;
2106  pr->u.p.ordered_upper = limit;
2107 #ifdef KMP_DEBUG
2108  {
2109  char *buff;
2110  // create format specifiers before the debug output
2111  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2112  "ordered_lower:%%%s ordered_upper:%%%s\n",
2113  traits_t<UT>::spec, traits_t<UT>::spec);
2114  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2115  pr->u.p.ordered_upper));
2116  __kmp_str_free(&buff);
2117  }
2118 #endif
2119  } // if
2120  } else {
2121  *p_lb = 0;
2122  *p_ub = 0;
2123  if (p_st != NULL)
2124  *p_st = 0;
2125  } // if
2126  } // case
2127  break;
2128 
2129  case kmp_sch_guided_analytical_chunked: {
2130  T chunkspec = pr->u.p.parm1;
2131  UT chunkIdx;
2132 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2133  /* for storing original FPCW value for Windows* OS on
2134  IA-32 architecture 8-byte version */
2135  unsigned int oldFpcw;
2136  unsigned int fpcwSet = 0;
2137 #endif
2138  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2139  "analytical case\n",
2140  gtid));
2141 
2142  trip = pr->u.p.tc;
2143 
2144  KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2145  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2146  trip);
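  // Analytical guided scheme: chunk boundaries come from the closed form in
  // __kmp_dispatch_guided_remaining() using the coefficient stored in parm3,
  // so chunk sizes shrink roughly geometrically.  parm2 is expected to be the
  // cross-over chunk index computed in __kmp_dispatch_init; from that index
  // on, the remaining tail is handed out dynamically in chunkspec-sized
  // pieces (first branch of the if below).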
2147 
2148  while (1) { /* this while loop is a safeguard against unexpected zero
2149  chunk sizes */
2150  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2151  if (chunkIdx >= (UT)pr->u.p.parm2) {
2152  --trip;
2153  /* use dynamic-style scheduling */
2154  init = chunkIdx * chunkspec + pr->u.p.count;
2155  /* need to verify init > 0 in case of overflow in the above
2156  * calculation */
2157  if ((status = (init > 0 && init <= trip)) != 0) {
2158  limit = init + chunkspec - 1;
2159 
2160  if ((last = (limit >= trip)) != 0)
2161  limit = trip;
2162  }
2163  break;
2164  } else {
2165 /* use exponential-style scheduling */
 2166 /* The following check works around the limited long double precision on
 2167  Windows* OS, which could otherwise make init != 0 for chunkIdx == 0.
2169  */
2170 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2171  /* If we haven't already done so, save original FPCW and set
2172  precision to 64-bit, as Windows* OS on IA-32 architecture
2173  defaults to 53-bit */
2174  if (!fpcwSet) {
2175  oldFpcw = _control87(0, 0);
2176  _control87(_PC_64, _MCW_PC);
2177  fpcwSet = 0x30000;
2178  }
2179 #endif
2180  if (chunkIdx) {
2181  init = __kmp_dispatch_guided_remaining<T>(
2182  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2183  KMP_DEBUG_ASSERT(init);
2184  init = trip - init;
2185  } else
2186  init = 0;
2187  limit = trip - __kmp_dispatch_guided_remaining<T>(
2188  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2189  KMP_ASSERT(init <= limit);
2190  if (init < limit) {
2191  KMP_DEBUG_ASSERT(limit <= trip);
2192  --limit;
2193  status = 1;
2194  break;
2195  } // if
2196  } // if
2197  } // while (1)
2198 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2199  /* restore FPCW if necessary
2200  AC: check fpcwSet flag first because oldFpcw can be uninitialized
2201  here */
2202  if (fpcwSet && (oldFpcw & fpcwSet))
2203  _control87(oldFpcw, _MCW_PC);
2204 #endif
2205  if (status != 0) {
2206  start = pr->u.p.lb;
2207  incr = pr->u.p.st;
2208  if (p_st != NULL)
2209  *p_st = incr;
2210  *p_lb = start + init * incr;
2211  *p_ub = start + limit * incr;
2212  if (pr->ordered) {
2213  pr->u.p.ordered_lower = init;
2214  pr->u.p.ordered_upper = limit;
2215 #ifdef KMP_DEBUG
2216  {
2217  char *buff;
2218  // create format specifiers before the debug output
2219  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2220  "ordered_lower:%%%s ordered_upper:%%%s\n",
2221  traits_t<UT>::spec, traits_t<UT>::spec);
2222  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2223  pr->u.p.ordered_upper));
2224  __kmp_str_free(&buff);
2225  }
2226 #endif
2227  }
2228  } else {
2229  *p_lb = 0;
2230  *p_ub = 0;
2231  if (p_st != NULL)
2232  *p_st = 0;
2233  }
2234  } // case
2235  break;
2236 
2237  case kmp_sch_trapezoidal: {
2238  UT index;
2239  T parm2 = pr->u.p.parm2;
2240  T parm3 = pr->u.p.parm3;
2241  T parm4 = pr->u.p.parm4;
2242  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2243  gtid));
2244 
2245  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2246 
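  // Chunk sizes decrease linearly: parm2 is expected to be the first chunk
  // size and parm4 the per-chunk decrement (both set in __kmp_dispatch_init),
  // while parm3 bounds the number of chunks.  The first iteration of chunk
  // 'index' is the sum of the preceding chunk sizes:
  //   index*parm2 - parm4*(0 + 1 + ... + (index-1))
  //     = (index * (2*parm2 - (index-1)*parm4)) / 2
  // which is the 'init' computed below; 'limit' is the same sum taken one
  // chunk further, minus one.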
2247  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2248  trip = pr->u.p.tc - 1;
2249 
2250  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2251  *p_lb = 0;
2252  *p_ub = 0;
2253  if (p_st != NULL)
2254  *p_st = 0;
2255  } else {
2256  start = pr->u.p.lb;
2257  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2258  incr = pr->u.p.st;
2259 
2260  if ((last = (limit >= trip)) != 0)
2261  limit = trip;
2262 
2263  if (p_st != NULL)
2264  *p_st = incr;
2265 
2266  if (incr == 1) {
2267  *p_lb = start + init;
2268  *p_ub = start + limit;
2269  } else {
2270  *p_lb = start + init * incr;
2271  *p_ub = start + limit * incr;
2272  }
2273 
2274  if (pr->ordered) {
2275  pr->u.p.ordered_lower = init;
2276  pr->u.p.ordered_upper = limit;
2277 #ifdef KMP_DEBUG
2278  {
2279  char *buff;
2280  // create format specifiers before the debug output
2281  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2282  "ordered_lower:%%%s ordered_upper:%%%s\n",
2283  traits_t<UT>::spec, traits_t<UT>::spec);
2284  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2285  pr->u.p.ordered_upper));
2286  __kmp_str_free(&buff);
2287  }
2288 #endif
2289  } // if
2290  } // if
2291  } // case
2292  break;
2293  default: {
2294  status = 0; // to avoid complaints on uninitialized variable use
2295  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2296  KMP_HNT(GetNewerLibrary), // Hint
2297  __kmp_msg_null // Variadic argument list terminator
2298  );
2299  } break;
2300  } // switch
2301  } // if tc == 0;
2302 
2303  if (status == 0) {
2304  UT num_done;
2305 
2306  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2307 #ifdef KMP_DEBUG
2308  {
2309  char *buff;
2310  // create format specifiers before the debug output
2311  buff = __kmp_str_format(
2312  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2313  traits_t<UT>::spec);
2314  KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2315  __kmp_str_free(&buff);
2316  }
2317 #endif
2318 
2319  if ((ST)num_done == th->th.th_team_nproc - 1) {
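  // Only the last thread of the team to finish the loop (its fetched
  // num_done equals nproc-1) resets the shared buffer and advances
  // buffer_index so the buffer can be recycled by a subsequent loop; all
  // threads then detach their private/shared dispatch pointers below.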
2320 #if (KMP_STATIC_STEAL_ENABLED)
2321  if (pr->schedule == kmp_sch_static_steal &&
2322  traits_t<T>::type_size > 4) {
2323  int i;
2324  kmp_info_t **other_threads = team->t.t_threads;
2325  // loop complete, safe to destroy locks used for stealing
2326  for (i = 0; i < th->th.th_team_nproc; ++i) {
2327  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2328  KMP_ASSERT(lck != NULL);
2329  __kmp_destroy_lock(lck);
2330  __kmp_free(lck);
2331  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2332  }
2333  }
2334 #endif
2335  /* NOTE: release this buffer to be reused */
2336 
2337  KMP_MB(); /* Flush all pending memory write invalidates. */
2338 
2339  sh->u.s.num_done = 0;
2340  sh->u.s.iteration = 0;
2341 
2342  /* TODO replace with general release procedure? */
2343  if (pr->ordered) {
2344  sh->u.s.ordered_iteration = 0;
2345  }
2346 
2347  KMP_MB(); /* Flush all pending memory write invalidates. */
2348 
2349  sh->buffer_index += __kmp_dispatch_num_buffers;
2350  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2351  gtid, sh->buffer_index));
2352 
2353  KMP_MB(); /* Flush all pending memory write invalidates. */
2354 
2355  } // if
2356  if (__kmp_env_consistency_check) {
2357  if (pr->pushed_ws != ct_none) {
2358  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2359  }
2360  }
2361 
2362  th->th.th_dispatch->th_deo_fcn = NULL;
2363  th->th.th_dispatch->th_dxo_fcn = NULL;
2364  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2365  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2366  } // if (status == 0)
2367 #if KMP_OS_WINDOWS
2368  else if (last) {
2369  pr->u.p.last_upper = pr->u.p.ub;
2370  }
2371 #endif /* KMP_OS_WINDOWS */
2372  if (p_last != NULL && status != 0)
2373  *p_last = last;
2374  } // if
2375 
2376 #ifdef KMP_DEBUG
2377  {
2378  char *buff;
2379  // create format specifiers before the debug output
2380  buff = __kmp_str_format(
2381  "__kmp_dispatch_next: T#%%d normal case: "
2382  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2383  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2384  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2385  __kmp_str_free(&buff);
2386  }
2387 #endif
2388 #if INCLUDE_SSC_MARKS
2389  SSC_MARK_DISPATCH_NEXT();
2390 #endif
2391  OMPT_LOOP_END;
2392  return status;
2393 }
2394 
2395 template <typename T>
2396 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2397  kmp_int32 *plastiter, T *plower, T *pupper,
2398  typename traits_t<T>::signed_t incr) {
2399  typedef typename traits_t<T>::unsigned_t UT;
2400  typedef typename traits_t<T>::signed_t ST;
2401  kmp_uint32 team_id;
2402  kmp_uint32 nteams;
2403  UT trip_count;
2404  kmp_team_t *team;
2405  kmp_info_t *th;
2406 
2407  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2408  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2409 #ifdef KMP_DEBUG
2410  {
2411  char *buff;
2412  // create format specifiers before the debug output
2413  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2414  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2415  traits_t<T>::spec, traits_t<T>::spec,
2416  traits_t<ST>::spec, traits_t<T>::spec);
2417  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2418  __kmp_str_free(&buff);
2419  }
2420 #endif
2421 
2422  if (__kmp_env_consistency_check) {
2423  if (incr == 0) {
2424  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2425  loc);
2426  }
2427  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2428  // The loop is illegal.
2429  // Some zero-trip loops maintained by compiler, e.g.:
2430  // for(i=10;i<0;++i) // lower >= upper - run-time check
2431  // for(i=0;i>10;--i) // lower <= upper - run-time check
2432  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2433  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2434  // Compiler does not check the following illegal loops:
2435  // for(i=0;i<10;i+=incr) // where incr<0
2436  // for(i=10;i>0;i-=incr) // where incr<0
2437  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2438  }
2439  }
2440  th = __kmp_threads[gtid];
2441  team = th->th.th_team;
2442 #if OMP_40_ENABLED
2443  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2444  nteams = th->th.th_teams_size.nteams;
2445 #endif
2446  team_id = team->t.t_master_tid;
2447  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2448 
2449  // compute global trip count
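  // e.g. lower == 0, upper == 9, incr == 2  ->  trip_count = (9-0)/2 + 1 = 5;
  // the division is done in the unsigned type so upper-lower cannot overflow
  // the signed range.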
2450  if (incr == 1) {
2451  trip_count = *pupper - *plower + 1;
2452  } else if (incr == -1) {
2453  trip_count = *plower - *pupper + 1;
2454  } else if (incr > 0) {
2455  // upper-lower can exceed the limit of signed type
2456  trip_count = (UT)(*pupper - *plower) / incr + 1;
2457  } else {
2458  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2459  }
2460 
2461  if (trip_count <= nteams) {
2462  KMP_DEBUG_ASSERT(
2463  __kmp_static == kmp_sch_static_greedy ||
2464  __kmp_static ==
2465  kmp_sch_static_balanced); // Unknown static scheduling type.
2466  // only some teams get single iteration, others get nothing
2467  if (team_id < trip_count) {
2468  *pupper = *plower = *plower + team_id * incr;
2469  } else {
2470  *plower = *pupper + incr; // zero-trip loop
2471  }
2472  if (plastiter != NULL)
2473  *plastiter = (team_id == trip_count - 1);
2474  } else {
2475  if (__kmp_static == kmp_sch_static_balanced) {
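  // Balanced split: every team gets trip_count/nteams iterations and the
  // first (trip_count % nteams) teams get one extra.  E.g. 10 iterations
  // over 4 teams -> chunk == 2, extras == 2, so teams get 3, 3, 2, 2.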
2476  UT chunk = trip_count / nteams;
2477  UT extras = trip_count % nteams;
2478  *plower +=
2479  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2480  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2481  if (plastiter != NULL)
2482  *plastiter = (team_id == nteams - 1);
2483  } else {
2484  T chunk_inc_count =
2485  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2486  T upper = *pupper;
2487  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2488  // Unknown static scheduling type.
2489  *plower += team_id * chunk_inc_count;
2490  *pupper = *plower + chunk_inc_count - incr;
2491  // Check/correct bounds if needed
2492  if (incr > 0) {
2493  if (*pupper < *plower)
2494  *pupper = traits_t<T>::max_value;
2495  if (plastiter != NULL)
2496  *plastiter = *plower <= upper && *pupper > upper - incr;
2497  if (*pupper > upper)
2498  *pupper = upper; // tracker C73258
2499  } else {
2500  if (*pupper > *plower)
2501  *pupper = traits_t<T>::min_value;
2502  if (plastiter != NULL)
2503  *plastiter = *plower >= upper && *pupper < upper - incr;
2504  if (*pupper < upper)
2505  *pupper = upper; // tracker C73258
2506  }
2507  }
2508  }
2509 }
2510 
2511 //-----------------------------------------------------------------------------
2512 // Dispatch routines
2513 // Transfer call to template< type T >
2514 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2515 // T lb, T ub, ST st, ST chunk )
2516 extern "C" {
2517 
2534 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2535  enum sched_type schedule, kmp_int32 lb,
2536  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2537  KMP_DEBUG_ASSERT(__kmp_init_serial);
2538 #if OMPT_SUPPORT && OMPT_OPTIONAL
2539  OMPT_STORE_RETURN_ADDRESS(gtid);
2540 #endif
2541  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2542 }
2546 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2547  enum sched_type schedule, kmp_uint32 lb,
2548  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2549  KMP_DEBUG_ASSERT(__kmp_init_serial);
2550 #if OMPT_SUPPORT && OMPT_OPTIONAL
2551  OMPT_STORE_RETURN_ADDRESS(gtid);
2552 #endif
2553  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2554 }
2555 
2559 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2560  enum sched_type schedule, kmp_int64 lb,
2561  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2562  KMP_DEBUG_ASSERT(__kmp_init_serial);
2563 #if OMPT_SUPPORT && OMPT_OPTIONAL
2564  OMPT_STORE_RETURN_ADDRESS(gtid);
2565 #endif
2566  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2567 }
2568 
2572 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2573  enum sched_type schedule, kmp_uint64 lb,
2574  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2575  KMP_DEBUG_ASSERT(__kmp_init_serial);
2576 #if OMPT_SUPPORT && OMPT_OPTIONAL
2577  OMPT_STORE_RETURN_ADDRESS(gtid);
2578 #endif
2579  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2580 }
2581 
2591 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2592  enum sched_type schedule, kmp_int32 *p_last,
2593  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2594  kmp_int32 chunk) {
2595  KMP_DEBUG_ASSERT(__kmp_init_serial);
2596 #if OMPT_SUPPORT && OMPT_OPTIONAL
2597  OMPT_STORE_RETURN_ADDRESS(gtid);
2598 #endif
2599  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2600  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2601 }
2602 
2603 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2604  enum sched_type schedule, kmp_int32 *p_last,
2605  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2606  kmp_int32 chunk) {
2607  KMP_DEBUG_ASSERT(__kmp_init_serial);
2608 #if OMPT_SUPPORT && OMPT_OPTIONAL
2609  OMPT_STORE_RETURN_ADDRESS(gtid);
2610 #endif
2611  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2612  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2613 }
2614 
2615 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2616  enum sched_type schedule, kmp_int32 *p_last,
2617  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2618  kmp_int64 chunk) {
2619  KMP_DEBUG_ASSERT(__kmp_init_serial);
2620 #if OMPT_SUPPORT && OMPT_OPTIONAL
2621  OMPT_STORE_RETURN_ADDRESS(gtid);
2622 #endif
2623  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2624  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2625 }
2626 
2627 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2628  enum sched_type schedule, kmp_int32 *p_last,
2629  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2630  kmp_int64 chunk) {
2631  KMP_DEBUG_ASSERT(__kmp_init_serial);
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633  OMPT_STORE_RETURN_ADDRESS(gtid);
2634 #endif
2635  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2636  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2637 }
2638 
2652 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2653  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2654 #if OMPT_SUPPORT && OMPT_OPTIONAL
2655  OMPT_STORE_RETURN_ADDRESS(gtid);
2656 #endif
2657  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2658 #if OMPT_SUPPORT && OMPT_OPTIONAL
2659  ,
2660  OMPT_LOAD_RETURN_ADDRESS(gtid)
2661 #endif
2662  );
2663 }
2664 
2668 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2669  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2670  kmp_int32 *p_st) {
2671 #if OMPT_SUPPORT && OMPT_OPTIONAL
2672  OMPT_STORE_RETURN_ADDRESS(gtid);
2673 #endif
2674  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2675 #if OMPT_SUPPORT && OMPT_OPTIONAL
2676  ,
2677  OMPT_LOAD_RETURN_ADDRESS(gtid)
2678 #endif
2679  );
2680 }
2681 
2685 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2686  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2687 #if OMPT_SUPPORT && OMPT_OPTIONAL
2688  OMPT_STORE_RETURN_ADDRESS(gtid);
2689 #endif
2690  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2691 #if OMPT_SUPPORT && OMPT_OPTIONAL
2692  ,
2693  OMPT_LOAD_RETURN_ADDRESS(gtid)
2694 #endif
2695  );
2696 }
2697 
2701 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2702  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2703  kmp_int64 *p_st) {
2704 #if OMPT_SUPPORT && OMPT_OPTIONAL
2705  OMPT_STORE_RETURN_ADDRESS(gtid);
2706 #endif
2707  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2708 #if OMPT_SUPPORT && OMPT_OPTIONAL
2709  ,
2710  OMPT_LOAD_RETURN_ADDRESS(gtid)
2711 #endif
2712  );
2713 }
2714 
2721 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2722  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2723 }
2724 
2728 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2729  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2730 }
2731 
2735 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2736  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2737 }
2738 
2742 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2743  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2744 }
2747 //-----------------------------------------------------------------------------
2748 // Non-template routines from kmp_dispatch.cpp used in other sources
2749 
2750 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2751  return value == checker;
2752 }
2753 
2754 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2755  return value != checker;
2756 }
2757 
2758 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2759  return value < checker;
2760 }
2761 
2762 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2763  return value >= checker;
2764 }
2765 
2766 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2767  return value <= checker;
2768 }
2769 
2770 kmp_uint32
2771 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2772  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2773  void *obj // Higher-level synchronization object, or NULL.
2774  ) {
2775  // note: we may not belong to a team at this point
2776  volatile kmp_uint32 *spin = spinner;
2777  kmp_uint32 check = checker;
2778  kmp_uint32 spins;
2779  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2780  kmp_uint32 r;
2781 
2782  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2783  KMP_INIT_YIELD(spins);
2784  // main wait spin loop
2785  while (!f(r = TCR_4(*spin), check)) {
2786  KMP_FSYNC_SPIN_PREPARE(obj);
2787  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2788  split. It causes problems with infinite recursion because of exit lock */
2789  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2790  __kmp_abort_thread(); */
2791 
2792  /* if we have waited a bit, or are oversubscribed, yield */
2793  /* pause is in the following code */
2794  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2795  KMP_YIELD_SPIN(spins);
2796  }
2797  KMP_FSYNC_SPIN_ACQUIRED(obj);
2798  return r;
2799 }
2800 
2801 void __kmp_wait_yield_4_ptr(
2802  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2803  void *obj // Higher-level synchronization object, or NULL.
2804  ) {
2805  // note: we may not belong to a team at this point
2806  void *spin = spinner;
2807  kmp_uint32 check = checker;
2808  kmp_uint32 spins;
2809  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2810 
2811  KMP_FSYNC_SPIN_INIT(obj, spin);
2812  KMP_INIT_YIELD(spins);
2813  // main wait spin loop
2814  while (!f(spin, check)) {
2815  KMP_FSYNC_SPIN_PREPARE(obj);
2816  /* if we have waited a bit, or are oversubscribed, yield */
2817  /* pause is in the following code */
2818  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2819  KMP_YIELD_SPIN(spins);
2820  }
2821  KMP_FSYNC_SPIN_ACQUIRED(obj);
2822 }
2823 
2824 } // extern "C"
2825 
2826 #ifdef KMP_GOMP_COMPAT
2827 
2828 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2829  enum sched_type schedule, kmp_int32 lb,
2830  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2831  int push_ws) {
2832  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2833  push_ws);
2834 }
2835 
2836 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2837  enum sched_type schedule, kmp_uint32 lb,
2838  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2839  int push_ws) {
2840  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2841  push_ws);
2842 }
2843 
2844 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2845  enum sched_type schedule, kmp_int64 lb,
2846  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2847  int push_ws) {
2848  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2849  push_ws);
2850 }
2851 
2852 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2853  enum sched_type schedule, kmp_uint64 lb,
2854  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2855  int push_ws) {
2856  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2857  push_ws);
2858 }
2859 
2860 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2861  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2862 }
2863 
2864 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2865  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2866 }
2867 
2868 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2869  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2870 }
2871 
2872 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2873  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2874 }
2875 
2876 #endif /* KMP_GOMP_COMPAT */
2877 
2878 /* ------------------------------------------------------------------------ */