1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_affinity.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_str.h"
21 #include "kmp_wrapper_getpid.h"
22 
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25 
26 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
27 
28 
29 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
30  kmp_uint32 depth;
31  // The test below is true if affinity is available, but set to "none". Need to
32  // init on first use of hierarchical barrier.
33  if (TCR_1(machine_hierarchy.uninitialized))
34  machine_hierarchy.init(NULL, nproc);
35 
36  // Adjust the hierarchy in case num threads exceeds original
37  if (nproc > machine_hierarchy.base_num_threads)
38  machine_hierarchy.resize(nproc);
39 
40  depth = machine_hierarchy.depth;
41  KMP_DEBUG_ASSERT(depth > 0);
42 
43  thr_bar->depth = depth;
44  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
45  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
46 }
47 
48 #if KMP_AFFINITY_SUPPORTED
49 
50 bool KMPAffinity::picked_api = false;
51 
52 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
53 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
54 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
55 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
56 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
57 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
58 
59 void KMPAffinity::pick_api() {
60  KMPAffinity *affinity_dispatch;
61  if (picked_api)
62  return;
63 #if KMP_USE_HWLOC
64  // Only use Hwloc if affinity isn't explicitly disabled and
65  // user requests Hwloc topology method
66  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
67  __kmp_affinity_type != affinity_disabled) {
68  affinity_dispatch = new KMPHwlocAffinity();
69  } else
70 #endif
71  {
72  affinity_dispatch = new KMPNativeAffinity();
73  }
74  __kmp_affinity_dispatch = affinity_dispatch;
75  picked_api = true;
76 }
77 
78 void KMPAffinity::destroy_api() {
79  if (__kmp_affinity_dispatch != NULL) {
80  delete __kmp_affinity_dispatch;
81  __kmp_affinity_dispatch = NULL;
82  picked_api = false;
83  }
84 }
85 
86 // Print the affinity mask to the character array in a pretty format.
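// For example, a mask containing OS procs 0, 2, 4 and 6 is printed as
// "{0,2,4,6}"; an empty mask prints as "{<empty>}", and a set too large for
// the buffer is truncated with ",...}" (see the overflow check below).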
87 char *__kmp_affinity_print_mask(char *buf, int buf_len,
88  kmp_affin_mask_t *mask) {
89  KMP_ASSERT(buf_len >= 40);
90  char *scan = buf;
91  char *end = buf + buf_len - 1;
92 
93  // Find first element / check for empty set.
94  size_t i;
95  i = mask->begin();
96  if (i == mask->end()) {
97  KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
98  while (*scan != '\0')
99  scan++;
100  KMP_ASSERT(scan <= end);
101  return buf;
102  }
103 
104  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
105  while (*scan != '\0')
106  scan++;
107  i++;
108  for (; i != mask->end(); i = mask->next(i)) {
109  if (!KMP_CPU_ISSET(i, mask)) {
110  continue;
111  }
112 
113  // Check for buffer overflow. A string of the form ",<n>" will have at most
114  // 10 characters, plus we want to leave room to print ",...}" if the set is
115  // too large to print for a total of 15 characters. We already left room for
116  // '\0' in setting end.
117  if (end - scan < 15) {
118  break;
119  }
120  KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
121  while (*scan != '\0')
122  scan++;
123  }
124  if (i != mask->end()) {
125  KMP_SNPRINTF(scan, end - scan + 1, ",...");
126  while (*scan != '\0')
127  scan++;
128  }
129  KMP_SNPRINTF(scan, end - scan + 1, "}");
130  while (*scan != '\0')
131  scan++;
132  KMP_ASSERT(scan <= end);
133  return buf;
134 }
135 
136 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
137  KMP_CPU_ZERO(mask);
138 
139 #if KMP_GROUP_AFFINITY
140 
141  if (__kmp_num_proc_groups > 1) {
142  int group;
143  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
144  for (group = 0; group < __kmp_num_proc_groups; group++) {
145  int i;
146  int num = __kmp_GetActiveProcessorCount(group);
147  for (i = 0; i < num; i++) {
148  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
149  }
150  }
151  } else
152 
153 #endif /* KMP_GROUP_AFFINITY */
154 
155  {
156  int proc;
157  for (proc = 0; proc < __kmp_xproc; proc++) {
158  KMP_CPU_SET(proc, mask);
159  }
160  }
161 }
162 
163 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
164 // called to renumber the labels from [0..n] and place them into the child_num
165 // vector of the address object. This is done in case the labels used for
166 // the children at one node of the hierarchy differ from those used for
167 // another node at the same level. Example: suppose the machine has 2 nodes
168 // with 2 packages each. The first node contains packages 601 and 602, and
169 // the second node contains packages 603 and 604. If we try to sort the table
170 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
171 // because we are paying attention to the labels themselves, not the ordinal
172 // child numbers. By using the child numbers in the sort, the result is
173 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
174 static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
175  int numAddrs) {
176  KMP_DEBUG_ASSERT(numAddrs > 0);
177  int depth = address2os->first.depth;
178  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
179  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
180  int labCt;
181  for (labCt = 0; labCt < depth; labCt++) {
182  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
183  lastLabel[labCt] = address2os[0].first.labels[labCt];
184  }
185  int i;
186  for (i = 1; i < numAddrs; i++) {
187  for (labCt = 0; labCt < depth; labCt++) {
188  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
189  int labCt2;
190  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
191  counts[labCt2] = 0;
192  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
193  }
194  counts[labCt]++;
195  lastLabel[labCt] = address2os[i].first.labels[labCt];
196  break;
197  }
198  }
199  for (labCt = 0; labCt < depth; labCt++) {
200  address2os[i].first.childNums[labCt] = counts[labCt];
201  }
202  for (; labCt < (int)Address::maxDepth; labCt++) {
203  address2os[i].first.childNums[labCt] = 0;
204  }
205  }
206  __kmp_free(lastLabel);
207  __kmp_free(counts);
208 }
209 
210 // All of the __kmp_affinity_create_*_map() routines should set
211 // __kmp_affinity_masks to a vector of affinity mask objects of length
212 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
213 // the number of levels in the machine topology tree (zero if
214 // __kmp_affinity_type == affinity_none).
215 //
216 // All of the __kmp_affinity_create_*_map() routines should set
217 // *__kmp_affin_fullMask to the affinity mask for the initialization thread.
218 // They need to save and restore the mask, and it could be needed later, so
219 // saving it is just an optimization to avoid calling __kmp_get_system_affinity()
220 // again.
221 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
222 
223 static int nCoresPerPkg, nPackages;
224 static int __kmp_nThreadsPerCore;
225 #ifndef KMP_DFLT_NTH_CORES
226 static int __kmp_ncores;
227 #endif
228 static int *__kmp_pu_os_idx = NULL;
229 
230 // __kmp_affinity_uniform_topology() doesn't work when called from
231 // places which support arbitrarily many levels in the machine topology
232 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() and
233 // __kmp_affinity_create_x2apicid_map().
234 inline static bool __kmp_affinity_uniform_topology() {
235  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
236 }
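// For example, a machine with 2 packages, 8 cores per package and 2 hardware
// threads per core is reported as uniform when __kmp_avail_proc == 2 * 8 * 2
// == 32; if fewer procs are available, the product no longer matches.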
237 
238 // Print out the detailed machine topology map, i.e. the physical locations
239 // of each OS proc.
240 static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
241  int depth, int pkgLevel,
242  int coreLevel, int threadLevel) {
243  int proc;
244 
245  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
246  for (proc = 0; proc < len; proc++) {
247  int level;
248  kmp_str_buf_t buf;
249  __kmp_str_buf_init(&buf);
250  for (level = 0; level < depth; level++) {
251  if (level == threadLevel) {
252  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
253  } else if (level == coreLevel) {
254  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
255  } else if (level == pkgLevel) {
256  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
257  } else if (level > pkgLevel) {
258  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
259  level - pkgLevel - 1);
260  } else {
261  __kmp_str_buf_print(&buf, "L%d ", level);
262  }
263  __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
264  }
265  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
266  buf.str);
267  __kmp_str_buf_free(&buf);
268  }
269 }
270 
271 #if KMP_USE_HWLOC
272 
273 // This function removes the topology levels that are radix 1 and don't offer
274 // further information about the topology. The most common example is when
275 // there is only one thread context per core: the extra thread-context level
276 // offers no unique labels, so it is removed.
277 // return value: the new depth of address2os
278 static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
279  int nActiveThreads, int depth,
280  int *pkgLevel, int *coreLevel,
281  int *threadLevel) {
282  int level;
283  int i;
284  int radix1_detected;
285 
286  for (level = depth - 1; level >= 0; --level) {
287  // Always keep the package level
288  if (level == *pkgLevel)
289  continue;
290  // Detect if this level is radix 1
291  radix1_detected = 1;
292  for (i = 1; i < nActiveThreads; ++i) {
293  if (address2os[0].first.labels[level] !=
294  address2os[i].first.labels[level]) {
295  // There are differing label values for this level so it stays
296  radix1_detected = 0;
297  break;
298  }
299  }
300  if (!radix1_detected)
301  continue;
302  // Radix 1 was detected
303  if (level == *threadLevel) {
304  // If only one thread per core, then just decrement
305  // the depth which removes the threadlevel from address2os
306  for (i = 0; i < nActiveThreads; ++i) {
307  address2os[i].first.depth--;
308  }
309  *threadLevel = -1;
310  } else if (level == *coreLevel) {
311  // For core level, we move the thread labels over if they are still
312  // valid (*threadLevel != -1), and also reduce the depth another level
313  for (i = 0; i < nActiveThreads; ++i) {
314  if (*threadLevel != -1) {
315  address2os[i].first.labels[*coreLevel] =
316  address2os[i].first.labels[*threadLevel];
317  }
318  address2os[i].first.depth--;
319  }
320  *coreLevel = -1;
321  }
322  }
323  return address2os[0].first.depth;
324 }
325 
326 // Returns the number of objects of type 'type' below 'obj' within the topology
327 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
328 // HWLOC_OBJ_PU, then this will return the number of PU's under the PACKAGE
329 // object.
330 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
331  hwloc_obj_type_t type) {
332  int retval = 0;
333  hwloc_obj_t first;
334  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
335  obj->logical_index, type, 0);
336  first != NULL &&
337  hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
338  obj;
339  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
340  first)) {
341  ++retval;
342  }
343  return retval;
344 }
345 
346 static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
347  kmp_i18n_id_t *const msg_id) {
348  *address2os = NULL;
349  *msg_id = kmp_i18n_null;
350 
351  // Save the affinity mask for the current thread.
352  kmp_affin_mask_t *oldMask;
353  KMP_CPU_ALLOC(oldMask);
354  __kmp_get_system_affinity(oldMask, TRUE);
355 
356  int depth = 3;
357  int pkgLevel = 0;
358  int coreLevel = 1;
359  int threadLevel = 2;
360 
361  if (!KMP_AFFINITY_CAPABLE()) {
362  // Hack to try and infer the machine topology using only the data
363 // available from hwloc and __kmp_xproc.
364  KMP_ASSERT(__kmp_affinity_type == affinity_none);
365 
366  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
367  hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
368  HWLOC_OBJ_CORE);
369  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
370  hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
371  HWLOC_OBJ_PU);
372  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
373  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
374  if (__kmp_affinity_verbose) {
375  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
376  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
377  if (__kmp_affinity_uniform_topology()) {
378  KMP_INFORM(Uniform, "KMP_AFFINITY");
379  } else {
380  KMP_INFORM(NonUniform, "KMP_AFFINITY");
381  }
382  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
383  __kmp_nThreadsPerCore, __kmp_ncores);
384  }
385  KMP_CPU_FREE(oldMask);
386  return 0;
387  }
388 
389  // Allocate the data structure to be returned.
390  AddrUnsPair *retval =
391  (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
392  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
393 
394  // When affinity is off, this routine will still be called to set
395  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
396  // nCoresPerPkg, & nPackages. Make sure all these vars are set
397  // correctly, and return if affinity is not enabled.
398 
399  hwloc_obj_t pu;
400  hwloc_obj_t core;
401  hwloc_obj_t socket;
402  int nActiveThreads = 0;
403  int socket_identifier = 0;
404  // re-calculate globals to count only accessible resources
405  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
406  for (socket =
407  hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
408  socket != NULL; socket = hwloc_get_next_obj_by_type(
409  __kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket),
410  socket_identifier++) {
411  int core_identifier = 0;
412  int num_active_cores = 0;
413  for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
414  socket->logical_index,
415  HWLOC_OBJ_CORE, 0);
416  core != NULL &&
417  hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
418  core) == socket;
419  core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
420  core),
421  core_identifier++) {
422  int pu_identifier = 0;
423  int num_active_threads = 0;
424  for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
425  core->logical_index, HWLOC_OBJ_PU,
426  0);
427  pu != NULL &&
428  hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
429  pu) == core;
430  pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
431  pu),
432  pu_identifier++) {
433  Address addr(3);
434  if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
435  continue; // skip inactive (inaccessible) unit
436  KA_TRACE(20,
437  ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
438  socket->os_index, socket->logical_index, core->os_index,
439  core->logical_index, pu->os_index, pu->logical_index));
440  addr.labels[0] = socket_identifier; // package
441  addr.labels[1] = core_identifier; // core
442  addr.labels[2] = pu_identifier; // pu
443  retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
444  __kmp_pu_os_idx[nActiveThreads] =
445  pu->os_index; // keep os index for each active pu
446  nActiveThreads++;
447  ++num_active_threads; // count active threads per core
448  }
449  if (num_active_threads) { // were there any active threads on the core?
450  ++__kmp_ncores; // count total active cores
451  ++num_active_cores; // count active cores per socket
452  if (num_active_threads > __kmp_nThreadsPerCore)
453  __kmp_nThreadsPerCore = num_active_threads; // calc maximum
454  }
455  }
456  if (num_active_cores) { // were there any active cores on the socket?
457  ++nPackages; // count total active packages
458  if (num_active_cores > nCoresPerPkg)
459  nCoresPerPkg = num_active_cores; // calc maximum
460  }
461  }
462 
463  // If there's only one thread context to bind to, return now.
464  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
465  KMP_ASSERT(nActiveThreads > 0);
466  if (nActiveThreads == 1) {
467  __kmp_ncores = nPackages = 1;
468  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
469  if (__kmp_affinity_verbose) {
470  char buf[KMP_AFFIN_MASK_PRINT_LEN];
471  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
472 
473  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
474  if (__kmp_affinity_respect_mask) {
475  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
476  } else {
477  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
478  }
479  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
480  KMP_INFORM(Uniform, "KMP_AFFINITY");
481  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
482  __kmp_nThreadsPerCore, __kmp_ncores);
483  }
484 
485  if (__kmp_affinity_type == affinity_none) {
486  __kmp_free(retval);
487  KMP_CPU_FREE(oldMask);
488  return 0;
489  }
490 
491  // Form an Address object which only includes the package level.
492  Address addr(1);
493  addr.labels[0] = retval[0].first.labels[pkgLevel];
494  retval[0].first = addr;
495 
496  if (__kmp_affinity_gran_levels < 0) {
497  __kmp_affinity_gran_levels = 0;
498  }
499 
500  if (__kmp_affinity_verbose) {
501  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
502  }
503 
504  *address2os = retval;
505  KMP_CPU_FREE(oldMask);
506  return 1;
507  }
508 
509  // Sort the table by physical Id.
510  qsort(retval, nActiveThreads, sizeof(*retval),
511  __kmp_affinity_cmp_Address_labels);
512 
513  // Check to see if the machine topology is uniform
514  unsigned uniform =
515  (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
516 
517  // Print the machine topology summary.
518  if (__kmp_affinity_verbose) {
519  char mask[KMP_AFFIN_MASK_PRINT_LEN];
520  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
521 
522  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
523  if (__kmp_affinity_respect_mask) {
524  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
525  } else {
526  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
527  }
528  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
529  if (uniform) {
530  KMP_INFORM(Uniform, "KMP_AFFINITY");
531  } else {
532  KMP_INFORM(NonUniform, "KMP_AFFINITY");
533  }
534 
535  kmp_str_buf_t buf;
536  __kmp_str_buf_init(&buf);
537 
538  __kmp_str_buf_print(&buf, "%d", nPackages);
539  // for (level = 1; level <= pkgLevel; level++) {
540  // __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
541  // }
542  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
543  __kmp_nThreadsPerCore, __kmp_ncores);
544 
545  __kmp_str_buf_free(&buf);
546  }
547 
548  if (__kmp_affinity_type == affinity_none) {
549  __kmp_free(retval);
550  KMP_CPU_FREE(oldMask);
551  return 0;
552  }
553 
554 // Find any levels with radix 1, and remove them from the map
555  // (except for the package level).
556  depth = __kmp_affinity_remove_radix_one_levels(
557  retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
558 
559  if (__kmp_affinity_gran_levels < 0) {
560  // Set the granularity level based on what levels are modeled
561  // in the machine topology map.
562  __kmp_affinity_gran_levels = 0;
563  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
564  __kmp_affinity_gran_levels++;
565  }
566  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
567  __kmp_affinity_gran_levels++;
568  }
569  if (__kmp_affinity_gran > affinity_gran_package) {
570  __kmp_affinity_gran_levels++;
571  }
572  }
573 
574  if (__kmp_affinity_verbose) {
575  __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
576  coreLevel, threadLevel);
577  }
578 
579  KMP_CPU_FREE(oldMask);
580  *address2os = retval;
581  return depth;
582 }
583 #endif // KMP_USE_HWLOC
584 
585 // If we don't know how to retrieve the machine's processor topology, or
586 // encounter an error in doing so, this routine is called to form a "flat"
587 // mapping of os thread id's <-> processor id's.
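// In the flat map each available OS proc becomes its own "package": the
// Address object has depth 1 and its single label is the OS proc id itself
// (see the loop over __kmp_affin_fullMask below).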
588 static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
589  kmp_i18n_id_t *const msg_id) {
590  *address2os = NULL;
591  *msg_id = kmp_i18n_null;
592 
593 // Even if __kmp_affinity_type == affinity_none, this routine might still be
594  // called to set __kmp_ncores, as well as
595  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
596  if (!KMP_AFFINITY_CAPABLE()) {
597  KMP_ASSERT(__kmp_affinity_type == affinity_none);
598  __kmp_ncores = nPackages = __kmp_xproc;
599  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
600  if (__kmp_affinity_verbose) {
601  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
602  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
603  KMP_INFORM(Uniform, "KMP_AFFINITY");
604  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
605  __kmp_nThreadsPerCore, __kmp_ncores);
606  }
607  return 0;
608  }
609 
610  // When affinity is off, this routine will still be called to set
611  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
612  // Make sure all these vars are set correctly, and return now if affinity is
613  // not enabled.
614  __kmp_ncores = nPackages = __kmp_avail_proc;
615  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
616  if (__kmp_affinity_verbose) {
617  char buf[KMP_AFFIN_MASK_PRINT_LEN];
618  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
619  __kmp_affin_fullMask);
620 
621  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
622  if (__kmp_affinity_respect_mask) {
623  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
624  } else {
625  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
626  }
627  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
628  KMP_INFORM(Uniform, "KMP_AFFINITY");
629  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
630  __kmp_nThreadsPerCore, __kmp_ncores);
631  }
632  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
633  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
634  if (__kmp_affinity_type == affinity_none) {
635  int avail_ct = 0;
636  int i;
637  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
638  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
639  continue;
640  __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
641  }
642  return 0;
643  }
644 
645 // Construct the data structure to be returned.
646  *address2os =
647  (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
648  int avail_ct = 0;
649  unsigned int i;
650  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
651  // Skip this proc if it is not included in the machine model.
652  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
653  continue;
654  }
655  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
656  Address addr(1);
657  addr.labels[0] = i;
658  (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
659  }
660  if (__kmp_affinity_verbose) {
661  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
662  }
663 
664  if (__kmp_affinity_gran_levels < 0) {
665  // Only the package level is modeled in the machine topology map,
666  // so the #levels of granularity is either 0 or 1.
667  if (__kmp_affinity_gran > affinity_gran_package) {
668  __kmp_affinity_gran_levels = 1;
669  } else {
670  __kmp_affinity_gran_levels = 0;
671  }
672  }
673  return 1;
674 }
675 
676 #if KMP_GROUP_AFFINITY
677 
678 // If multiple Windows* OS processor groups exist, we can create a 2-level
679 // topology map with the groups at level 0 and the individual procs at level 1.
680 // This facilitates letting the threads float among all procs in a group,
681 // if granularity=group (the default when there are multiple groups).
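// For illustration, assuming a 64-bit DWORD_PTR: OS proc 70 gets the
// two-level label {group 1, proc-in-group 6}, since 70 / 64 == 1 and
// 70 % 64 == 6 (see the label assignment in the loop below).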
682 static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
683  kmp_i18n_id_t *const msg_id) {
684  *address2os = NULL;
685  *msg_id = kmp_i18n_null;
686 
687  // If we aren't affinity capable, then return now.
688  // The flat mapping will be used.
689  if (!KMP_AFFINITY_CAPABLE()) {
690  // FIXME set *msg_id
691  return -1;
692  }
693 
694 // Construct the data structure to be returned.
695  *address2os =
696  (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
697  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
698  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
699  int avail_ct = 0;
700  int i;
701  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
702  // Skip this proc if it is not included in the machine model.
703  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
704  continue;
705  }
706  __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
707  Address addr(2);
708  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
709  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
710  (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
711 
712  if (__kmp_affinity_verbose) {
713  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
714  addr.labels[1]);
715  }
716  }
717 
718  if (__kmp_affinity_gran_levels < 0) {
719  if (__kmp_affinity_gran == affinity_gran_group) {
720  __kmp_affinity_gran_levels = 1;
721  } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
722  (__kmp_affinity_gran == affinity_gran_thread)) {
723  __kmp_affinity_gran_levels = 0;
724  } else {
725  const char *gran_str = NULL;
726  if (__kmp_affinity_gran == affinity_gran_core) {
727  gran_str = "core";
728  } else if (__kmp_affinity_gran == affinity_gran_package) {
729  gran_str = "package";
730  } else if (__kmp_affinity_gran == affinity_gran_node) {
731  gran_str = "node";
732  } else {
733  KMP_ASSERT(0);
734  }
735 
736  // Warning: can't use affinity granularity \"gran\" with group topology
737  // method, using "thread"
738  __kmp_affinity_gran_levels = 0;
739  }
740  }
741  return 2;
742 }
743 
744 #endif /* KMP_GROUP_AFFINITY */
745 
746 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
747 
748 static int __kmp_cpuid_mask_width(int count) {
749  int r = 0;
750 
751  while ((1 << r) < count)
752  ++r;
753  return r;
754 }
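// For example, __kmp_cpuid_mask_width(6) returns 3, since 2^3 == 8 is the
// smallest power of two that is >= 6; a count of exactly 8 also yields 3.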
755 
756 class apicThreadInfo {
757 public:
758  unsigned osId; // param to __kmp_affinity_bind_thread
759  unsigned apicId; // from cpuid after binding
760  unsigned maxCoresPerPkg; // ""
761  unsigned maxThreadsPerPkg; // ""
762  unsigned pkgId; // inferred from above values
763  unsigned coreId; // ""
764  unsigned threadId; // ""
765 };
766 
767 static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
768  const void *b) {
769  const apicThreadInfo *aa = (const apicThreadInfo *)a;
770  const apicThreadInfo *bb = (const apicThreadInfo *)b;
771  if (aa->osId < bb->osId)
772  return -1;
773  if (aa->osId > bb->osId)
774  return 1;
775  return 0;
776 }
777 
778 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
779  const void *b) {
780  const apicThreadInfo *aa = (const apicThreadInfo *)a;
781  const apicThreadInfo *bb = (const apicThreadInfo *)b;
782  if (aa->pkgId < bb->pkgId)
783  return -1;
784  if (aa->pkgId > bb->pkgId)
785  return 1;
786  if (aa->coreId < bb->coreId)
787  return -1;
788  if (aa->coreId > bb->coreId)
789  return 1;
790  if (aa->threadId < bb->threadId)
791  return -1;
792  if (aa->threadId > bb->threadId)
793  return 1;
794  return 0;
795 }
796 
797 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
798 // an algorithm which cycles through the available OS threads, setting
799 // the current thread's affinity mask to each one in turn, and then retrieving
800 // the Apic Id for that thread context using the cpuid instruction.
801 static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
802  kmp_i18n_id_t *const msg_id) {
803  kmp_cpuid buf;
804  int rc;
805  *address2os = NULL;
806  *msg_id = kmp_i18n_null;
807 
808  // Check if cpuid leaf 4 is supported.
809  __kmp_x86_cpuid(0, 0, &buf);
810  if (buf.eax < 4) {
811  *msg_id = kmp_i18n_str_NoLeaf4Support;
812  return -1;
813  }
814 
815  // The algorithm used starts by setting the affinity to each available thread
816  // and retrieving info from the cpuid instruction, so if we are not capable of
817 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
818  // need to do something else - use the defaults that we calculated from
819  // issuing cpuid without binding to each proc.
820  if (!KMP_AFFINITY_CAPABLE()) {
821  // Hack to try and infer the machine topology using only the data
822  // available from cpuid on the current thread, and __kmp_xproc.
823  KMP_ASSERT(__kmp_affinity_type == affinity_none);
824 
825  // Get an upper bound on the number of threads per package using cpuid(1).
826 // On some OS/chip combinations where HT is supported by the chip but is
827  // disabled, this value will be 2 on a single core chip. Usually, it will be
828  // 2 if HT is enabled and 1 if HT is disabled.
829  __kmp_x86_cpuid(1, 0, &buf);
830  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
831  if (maxThreadsPerPkg == 0) {
832  maxThreadsPerPkg = 1;
833  }
834 
835  // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
836  // value.
837  //
838 // The author of cpu_count.cpp treated this as only an upper bound on the
839  // number of cores, but I haven't seen any cases where it was greater than
840  // the actual number of cores, so we will treat it as exact in this block of
841  // code.
842  //
843  // First, we need to check if cpuid(4) is supported on this chip. To see if
844  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
845  // greater.
846  __kmp_x86_cpuid(0, 0, &buf);
847  if (buf.eax >= 4) {
848  __kmp_x86_cpuid(4, 0, &buf);
849  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
850  } else {
851  nCoresPerPkg = 1;
852  }
853 
854  // There is no way to reliably tell if HT is enabled without issuing the
855 // cpuid instruction from every thread and correlating the cpuid info, so
856  // if the machine is not affinity capable, we assume that HT is off. We have
857  // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
858  // does not support HT.
859  //
860  // - Older OSes are usually found on machines with older chips, which do not
861  // support HT.
862  // - The performance penalty for mistakenly identifying a machine as HT when
863 // it isn't (which results in blocktime being incorrectly set to 0) is
864 // greater than the penalty for mistakenly identifying a machine as
865  // being 1 thread/core when it is really HT enabled (which results in
866  // blocktime being incorrectly set to a positive value).
867  __kmp_ncores = __kmp_xproc;
868  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
869  __kmp_nThreadsPerCore = 1;
870  if (__kmp_affinity_verbose) {
871  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
872  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
873  if (__kmp_affinity_uniform_topology()) {
874  KMP_INFORM(Uniform, "KMP_AFFINITY");
875  } else {
876  KMP_INFORM(NonUniform, "KMP_AFFINITY");
877  }
878  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
879  __kmp_nThreadsPerCore, __kmp_ncores);
880  }
881  return 0;
882  }
883 
884  // From here on, we can assume that it is safe to call
885  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
886  // __kmp_affinity_type = affinity_none.
887 
888  // Save the affinity mask for the current thread.
889  kmp_affin_mask_t *oldMask;
890  KMP_CPU_ALLOC(oldMask);
891  KMP_ASSERT(oldMask != NULL);
892  __kmp_get_system_affinity(oldMask, TRUE);
893 
894  // Run through each of the available contexts, binding the current thread
895  // to it, and obtaining the pertinent information using the cpuid instr.
896  //
897  // The relevant information is:
898  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
899 // has a unique Apic Id, which is of the form pkg# : core# : thread#.
900  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
901  // of this field determines the width of the core# + thread# fields in the
902  // Apic Id. It is also an upper bound on the number of threads per
903 // package, but it has been verified that situations happen where it is not
904  // exact. In particular, on certain OS/chip combinations where Intel(R)
905  // Hyper-Threading Technology is supported by the chip but has been
906  // disabled, the value of this field will be 2 (for a single core chip).
907  // On other OS/chip combinations supporting Intel(R) Hyper-Threading
908  // Technology, the value of this field will be 1 when Intel(R)
909  // Hyper-Threading Technology is disabled and 2 when it is enabled.
910  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
911  // of this field (+1) determines the width of the core# field in the Apic
912  // Id. The comments in "cpucount.cpp" say that this value is an upper
913  // bound, but the IA-32 architecture manual says that it is exactly the
914  // number of cores per package, and I haven't seen any case where it
915  // wasn't.
916  //
917  // From this information, deduce the package Id, core Id, and thread Id,
918  // and set the corresponding fields in the apicThreadInfo struct.
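  // For illustration (hypothetical values): if maxThreadsPerPkg == 16 and
  // maxCoresPerPkg == 8, then widthCT == 4, widthC == 3 and widthT == 1, so an
  // Apic Id of 0x1B (binary 11011) decodes to pkgId == 1, coreId == 5 and
  // threadId == 1 using the shifts and masks computed below.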
919  unsigned i;
920  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
921  __kmp_avail_proc * sizeof(apicThreadInfo));
922  unsigned nApics = 0;
923  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
924  // Skip this proc if it is not included in the machine model.
925  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
926  continue;
927  }
928  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
929 
930  __kmp_affinity_dispatch->bind_thread(i);
931  threadInfo[nApics].osId = i;
932 
933  // The apic id and max threads per pkg come from cpuid(1).
934  __kmp_x86_cpuid(1, 0, &buf);
935  if (((buf.edx >> 9) & 1) == 0) {
936  __kmp_set_system_affinity(oldMask, TRUE);
937  __kmp_free(threadInfo);
938  KMP_CPU_FREE(oldMask);
939  *msg_id = kmp_i18n_str_ApicNotPresent;
940  return -1;
941  }
942  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
943  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
944  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
945  threadInfo[nApics].maxThreadsPerPkg = 1;
946  }
947 
948  // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
949  // value.
950  //
951  // First, we need to check if cpuid(4) is supported on this chip. To see if
952  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
953  // or greater.
954  __kmp_x86_cpuid(0, 0, &buf);
955  if (buf.eax >= 4) {
956  __kmp_x86_cpuid(4, 0, &buf);
957  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
958  } else {
959  threadInfo[nApics].maxCoresPerPkg = 1;
960  }
961 
962  // Infer the pkgId / coreId / threadId using only the info obtained locally.
963  int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
964  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
965 
966  int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
967  int widthT = widthCT - widthC;
968  if (widthT < 0) {
969  // I've never seen this one happen, but I suppose it could, if the cpuid
970  // instruction on a chip was really screwed up. Make sure to restore the
971  // affinity mask before the tail call.
972  __kmp_set_system_affinity(oldMask, TRUE);
973  __kmp_free(threadInfo);
974  KMP_CPU_FREE(oldMask);
975  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
976  return -1;
977  }
978 
979  int maskC = (1 << widthC) - 1;
980  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
981 
982  int maskT = (1 << widthT) - 1;
983  threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
984 
985  nApics++;
986  }
987 
988  // We've collected all the info we need.
989  // Restore the old affinity mask for this thread.
990  __kmp_set_system_affinity(oldMask, TRUE);
991 
992  // If there's only one thread context to bind to, form an Address object
993  // with depth 1 and return immediately (or, if affinity is off, set
994  // address2os to NULL and return).
995  //
996  // If it is configured to omit the package level when there is only a single
997  // package, the logic at the end of this routine won't work if there is only
998  // a single thread - it would try to form an Address object with depth 0.
999  KMP_ASSERT(nApics > 0);
1000  if (nApics == 1) {
1001  __kmp_ncores = nPackages = 1;
1002  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1003  if (__kmp_affinity_verbose) {
1004  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1005  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1006 
1007  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1008  if (__kmp_affinity_respect_mask) {
1009  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1010  } else {
1011  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1012  }
1013  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1014  KMP_INFORM(Uniform, "KMP_AFFINITY");
1015  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1016  __kmp_nThreadsPerCore, __kmp_ncores);
1017  }
1018 
1019  if (__kmp_affinity_type == affinity_none) {
1020  __kmp_free(threadInfo);
1021  KMP_CPU_FREE(oldMask);
1022  return 0;
1023  }
1024 
1025  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
1026  Address addr(1);
1027  addr.labels[0] = threadInfo[0].pkgId;
1028  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1029 
1030  if (__kmp_affinity_gran_levels < 0) {
1031  __kmp_affinity_gran_levels = 0;
1032  }
1033 
1034  if (__kmp_affinity_verbose) {
1035  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1036  }
1037 
1038  __kmp_free(threadInfo);
1039  KMP_CPU_FREE(oldMask);
1040  return 1;
1041  }
1042 
1043  // Sort the threadInfo table by physical Id.
1044  qsort(threadInfo, nApics, sizeof(*threadInfo),
1045  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1046 
1047  // The table is now sorted by pkgId / coreId / threadId, but we really don't
1048  // know the radix of any of the fields. pkgId's may be sparsely assigned among
1049  // the chips on a system. Although coreId's are usually assigned
1050  // [0 .. coresPerPkg-1] and threadId's are usually assigned
1051  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1052  //
1053  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1054  // total # packages) are at this point - we want to determine that now. We
1055  // only have an upper bound on the first two figures.
1056  //
1057  // We also perform a consistency check at this point: the values returned by
1058  // the cpuid instruction for any thread bound to a given package had better
1059  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1060  nPackages = 1;
1061  nCoresPerPkg = 1;
1062  __kmp_nThreadsPerCore = 1;
1063  unsigned nCores = 1;
1064 
1065  unsigned pkgCt = 1; // to determine radii
1066  unsigned lastPkgId = threadInfo[0].pkgId;
1067  unsigned coreCt = 1;
1068  unsigned lastCoreId = threadInfo[0].coreId;
1069  unsigned threadCt = 1;
1070  unsigned lastThreadId = threadInfo[0].threadId;
1071 
1072 // intra-pkg consistency checks
1073  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1074  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1075 
1076  for (i = 1; i < nApics; i++) {
1077  if (threadInfo[i].pkgId != lastPkgId) {
1078  nCores++;
1079  pkgCt++;
1080  lastPkgId = threadInfo[i].pkgId;
1081  if ((int)coreCt > nCoresPerPkg)
1082  nCoresPerPkg = coreCt;
1083  coreCt = 1;
1084  lastCoreId = threadInfo[i].coreId;
1085  if ((int)threadCt > __kmp_nThreadsPerCore)
1086  __kmp_nThreadsPerCore = threadCt;
1087  threadCt = 1;
1088  lastThreadId = threadInfo[i].threadId;
1089 
1090  // This is a different package, so go on to the next iteration without
1091  // doing any consistency checks. Reset the consistency check vars, though.
1092  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1093  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1094  continue;
1095  }
1096 
1097  if (threadInfo[i].coreId != lastCoreId) {
1098  nCores++;
1099  coreCt++;
1100  lastCoreId = threadInfo[i].coreId;
1101  if ((int)threadCt > __kmp_nThreadsPerCore)
1102  __kmp_nThreadsPerCore = threadCt;
1103  threadCt = 1;
1104  lastThreadId = threadInfo[i].threadId;
1105  } else if (threadInfo[i].threadId != lastThreadId) {
1106  threadCt++;
1107  lastThreadId = threadInfo[i].threadId;
1108  } else {
1109  __kmp_free(threadInfo);
1110  KMP_CPU_FREE(oldMask);
1111  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1112  return -1;
1113  }
1114 
1115  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1116 // fields agree between all the threads bound to a given package.
1117  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1118  (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1119  __kmp_free(threadInfo);
1120  KMP_CPU_FREE(oldMask);
1121  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1122  return -1;
1123  }
1124  }
1125  nPackages = pkgCt;
1126  if ((int)coreCt > nCoresPerPkg)
1127  nCoresPerPkg = coreCt;
1128  if ((int)threadCt > __kmp_nThreadsPerCore)
1129  __kmp_nThreadsPerCore = threadCt;
1130 
1131  // When affinity is off, this routine will still be called to set
1132  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1133  // Make sure all these vars are set correctly, and return now if affinity is
1134  // not enabled.
1135  __kmp_ncores = nCores;
1136  if (__kmp_affinity_verbose) {
1137  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1138  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1139 
1140  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1141  if (__kmp_affinity_respect_mask) {
1142  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1143  } else {
1144  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1145  }
1146  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1147  if (__kmp_affinity_uniform_topology()) {
1148  KMP_INFORM(Uniform, "KMP_AFFINITY");
1149  } else {
1150  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1151  }
1152  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1153  __kmp_nThreadsPerCore, __kmp_ncores);
1154  }
1155  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1156  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1157  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1158  for (i = 0; i < nApics; ++i) {
1159  __kmp_pu_os_idx[i] = threadInfo[i].osId;
1160  }
1161  if (__kmp_affinity_type == affinity_none) {
1162  __kmp_free(threadInfo);
1163  KMP_CPU_FREE(oldMask);
1164  return 0;
1165  }
1166 
1167  // Now that we've determined the number of packages, the number of cores per
1168  // package, and the number of threads per core, we can construct the data
1169  // structure that is to be returned.
1170  int pkgLevel = 0;
1171  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1172  int threadLevel =
1173  (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1174  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1175 
1176  KMP_ASSERT(depth > 0);
1177  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1178 
1179  for (i = 0; i < nApics; ++i) {
1180  Address addr(depth);
1181  unsigned os = threadInfo[i].osId;
1182  int d = 0;
1183 
1184  if (pkgLevel >= 0) {
1185  addr.labels[d++] = threadInfo[i].pkgId;
1186  }
1187  if (coreLevel >= 0) {
1188  addr.labels[d++] = threadInfo[i].coreId;
1189  }
1190  if (threadLevel >= 0) {
1191  addr.labels[d++] = threadInfo[i].threadId;
1192  }
1193  (*address2os)[i] = AddrUnsPair(addr, os);
1194  }
1195 
1196  if (__kmp_affinity_gran_levels < 0) {
1197  // Set the granularity level based on what levels are modeled in the machine
1198  // topology map.
1199  __kmp_affinity_gran_levels = 0;
1200  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1201  __kmp_affinity_gran_levels++;
1202  }
1203  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1204  __kmp_affinity_gran_levels++;
1205  }
1206  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1207  __kmp_affinity_gran_levels++;
1208  }
1209  }
1210 
1211  if (__kmp_affinity_verbose) {
1212  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1213  coreLevel, threadLevel);
1214  }
1215 
1216  __kmp_free(threadInfo);
1217  KMP_CPU_FREE(oldMask);
1218  return depth;
1219 }
1220 
1221 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1222 // architectures support a newer interface for specifying the x2APIC Ids,
1223 // based on cpuid leaf 11.
1224 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1225  kmp_i18n_id_t *const msg_id) {
1226  kmp_cpuid buf;
1227  *address2os = NULL;
1228  *msg_id = kmp_i18n_null;
1229 
1230  // Check to see if cpuid leaf 11 is supported.
1231  __kmp_x86_cpuid(0, 0, &buf);
1232  if (buf.eax < 11) {
1233  *msg_id = kmp_i18n_str_NoLeaf11Support;
1234  return -1;
1235  }
1236  __kmp_x86_cpuid(11, 0, &buf);
1237  if (buf.ebx == 0) {
1238  *msg_id = kmp_i18n_str_NoLeaf11Support;
1239  return -1;
1240  }
1241 
1242  // Find the number of levels in the machine topology. While we're at it, get
1243  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
1244  // get more accurate values later by explicitly counting them, but get
1245  // reasonable defaults now, in case we return early.
1246  int level;
1247  int threadLevel = -1;
1248  int coreLevel = -1;
1249  int pkgLevel = -1;
1250  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1251 
1252  for (level = 0;; level++) {
1253  if (level > 31) {
1254  // FIXME: Hack for DPD200163180
1255  //
1256  // If level is big then something went wrong -> exiting
1257  //
1258  // There could actually be 32 valid levels in the machine topology, but so
1259  // far, the only machine we have seen which does not exit this loop before
1260  // iteration 32 has fubar x2APIC settings.
1261  //
1262  // For now, just reject this case based upon loop trip count.
1263  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1264  return -1;
1265  }
1266  __kmp_x86_cpuid(11, level, &buf);
1267  if (buf.ebx == 0) {
1268  if (pkgLevel < 0) {
1269  // Will infer nPackages from __kmp_xproc
1270  pkgLevel = level;
1271  level++;
1272  }
1273  break;
1274  }
1275  int kind = (buf.ecx >> 8) & 0xff;
1276  if (kind == 1) {
1277  // SMT level
1278  threadLevel = level;
1279  coreLevel = -1;
1280  pkgLevel = -1;
1281  __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1282  if (__kmp_nThreadsPerCore == 0) {
1283  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1284  return -1;
1285  }
1286  } else if (kind == 2) {
1287  // core level
1288  coreLevel = level;
1289  pkgLevel = -1;
1290  nCoresPerPkg = buf.ebx & 0xffff;
1291  if (nCoresPerPkg == 0) {
1292  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1293  return -1;
1294  }
1295  } else {
1296  if (level <= 0) {
1297  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1298  return -1;
1299  }
1300  if (pkgLevel >= 0) {
1301  continue;
1302  }
1303  pkgLevel = level;
1304  nPackages = buf.ebx & 0xffff;
1305  if (nPackages == 0) {
1306  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1307  return -1;
1308  }
1309  }
1310  }
1311  int depth = level;
1312 
1313  // In the above loop, "level" was counted from the finest level (usually
1314  // thread) to the coarsest. The caller expects that we will place the labels
1315  // in (*address2os)[].first.labels[] in the inverse order, so we need to
1316  // invert the vars saying which level means what.
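  // For example, with depth == 3 and the loop having recorded threadLevel == 0,
  // coreLevel == 1 and pkgLevel == 2, the inversion below yields
  // threadLevel == 2, coreLevel == 1 and pkgLevel == 0, matching the
  // package/core/thread order of the labels.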
1317  if (threadLevel >= 0) {
1318  threadLevel = depth - threadLevel - 1;
1319  }
1320  if (coreLevel >= 0) {
1321  coreLevel = depth - coreLevel - 1;
1322  }
1323  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1324  pkgLevel = depth - pkgLevel - 1;
1325 
1326  // The algorithm used starts by setting the affinity to each available thread
1327  // and retrieving info from the cpuid instruction, so if we are not capable of
1328 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1329  // need to do something else - use the defaults that we calculated from
1330  // issuing cpuid without binding to each proc.
1331  if (!KMP_AFFINITY_CAPABLE()) {
1332  // Hack to try and infer the machine topology using only the data
1333  // available from cpuid on the current thread, and __kmp_xproc.
1334  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1335 
1336  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1337  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1338  if (__kmp_affinity_verbose) {
1339  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1340  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1341  if (__kmp_affinity_uniform_topology()) {
1342  KMP_INFORM(Uniform, "KMP_AFFINITY");
1343  } else {
1344  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1345  }
1346  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1347  __kmp_nThreadsPerCore, __kmp_ncores);
1348  }
1349  return 0;
1350  }
1351 
1352  // From here on, we can assume that it is safe to call
1353  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1354  // __kmp_affinity_type = affinity_none.
1355 
1356  // Save the affinity mask for the current thread.
1357  kmp_affin_mask_t *oldMask;
1358  KMP_CPU_ALLOC(oldMask);
1359  __kmp_get_system_affinity(oldMask, TRUE);
1360 
1361  // Allocate the data structure to be returned.
1362  AddrUnsPair *retval =
1363  (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1364 
1365  // Run through each of the available contexts, binding the current thread
1366  // to it, and obtaining the pertinent information using the cpuid instr.
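  // For illustration (hypothetical shift values, three-level topology): if
  // cpuid(11) reports a shift of 1 at the SMT level and 4 at the core level,
  // an x2APIC id of 0x35 decodes to thread == 0x35 & 0x1 == 1,
  // core == (0x35 & 0xF) >> 1 == 2 and package == 0x35 >> 4 == 3, which is
  // how the loop below fills addr.labels[].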
1367  unsigned int proc;
1368  int nApics = 0;
1369  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1370  // Skip this proc if it is not included in the machine model.
1371  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1372  continue;
1373  }
1374  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1375 
1376  __kmp_affinity_dispatch->bind_thread(proc);
1377 
1378  // Extract labels for each level in the machine topology map from Apic ID.
1379  Address addr(depth);
1380  int prev_shift = 0;
1381 
1382  for (level = 0; level < depth; level++) {
1383  __kmp_x86_cpuid(11, level, &buf);
1384  unsigned apicId = buf.edx;
1385  if (buf.ebx == 0) {
1386  if (level != depth - 1) {
1387  KMP_CPU_FREE(oldMask);
1388  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1389  return -1;
1390  }
1391  addr.labels[depth - level - 1] = apicId >> prev_shift;
1392  level++;
1393  break;
1394  }
1395  int shift = buf.eax & 0x1f;
1396  int mask = (1 << shift) - 1;
1397  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1398  prev_shift = shift;
1399  }
1400  if (level != depth) {
1401  KMP_CPU_FREE(oldMask);
1402  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1403  return -1;
1404  }
1405 
1406  retval[nApics] = AddrUnsPair(addr, proc);
1407  nApics++;
1408  }
1409 
1410  // We've collected all the info we need.
1411  // Restore the old affinity mask for this thread.
1412  __kmp_set_system_affinity(oldMask, TRUE);
1413 
1414  // If there's only one thread context to bind to, return now.
1415  KMP_ASSERT(nApics > 0);
1416  if (nApics == 1) {
1417  __kmp_ncores = nPackages = 1;
1418  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1419  if (__kmp_affinity_verbose) {
1420  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1421  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1422 
1423  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1424  if (__kmp_affinity_respect_mask) {
1425  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1426  } else {
1427  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1428  }
1429  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1430  KMP_INFORM(Uniform, "KMP_AFFINITY");
1431  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1432  __kmp_nThreadsPerCore, __kmp_ncores);
1433  }
1434 
1435  if (__kmp_affinity_type == affinity_none) {
1436  __kmp_free(retval);
1437  KMP_CPU_FREE(oldMask);
1438  return 0;
1439  }
1440 
1441  // Form an Address object which only includes the package level.
1442  Address addr(1);
1443  addr.labels[0] = retval[0].first.labels[pkgLevel];
1444  retval[0].first = addr;
1445 
1446  if (__kmp_affinity_gran_levels < 0) {
1447  __kmp_affinity_gran_levels = 0;
1448  }
1449 
1450  if (__kmp_affinity_verbose) {
1451  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1452  }
1453 
1454  *address2os = retval;
1455  KMP_CPU_FREE(oldMask);
1456  return 1;
1457  }
1458 
1459  // Sort the table by physical Id.
1460  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1461 
1462  // Find the radix at each of the levels.
1463  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1464  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1465  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1466  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1467  for (level = 0; level < depth; level++) {
1468  totals[level] = 1;
1469  maxCt[level] = 1;
1470  counts[level] = 1;
1471  last[level] = retval[0].first.labels[level];
1472  }
1473 
1474  // From here on, the iteration variable "level" runs from the finest level to
1475  // the coarsest, i.e. we iterate forward through
1476  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1477  // backwards.
1478  for (proc = 1; (int)proc < nApics; proc++) {
1479  int level;
1480  for (level = 0; level < depth; level++) {
1481  if (retval[proc].first.labels[level] != last[level]) {
1482  int j;
1483  for (j = level + 1; j < depth; j++) {
1484  totals[j]++;
1485  counts[j] = 1;
1486 // The (commented) line below caused incorrect topology information to be
1487 // printed when the maximum value for some level (maxCt[level]) was
1488 // encountered earlier than a smaller value while going through the array.
1489 // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores: then
1490 // maxCt[1] would be reported as 2,
1491 // whereas it must be 4.
1492 // TODO!!! Check whether it is safe to keep this line commented out.
1493  // maxCt[j] = 1;
1494  last[j] = retval[proc].first.labels[j];
1495  }
1496  totals[level]++;
1497  counts[level]++;
1498  if (counts[level] > maxCt[level]) {
1499  maxCt[level] = counts[level];
1500  }
1501  last[level] = retval[proc].first.labels[level];
1502  break;
1503  } else if (level == depth - 1) {
1504  __kmp_free(last);
1505  __kmp_free(maxCt);
1506  __kmp_free(counts);
1507  __kmp_free(totals);
1508  __kmp_free(retval);
1509  KMP_CPU_FREE(oldMask);
1510  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1511  return -1;
1512  }
1513  }
1514  }
1515 
1516  // When affinity is off, this routine will still be called to set
1517  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1518  // Make sure all these vars are set correctly, and return if affinity is not
1519  // enabled.
1520  if (threadLevel >= 0) {
1521  __kmp_nThreadsPerCore = maxCt[threadLevel];
1522  } else {
1523  __kmp_nThreadsPerCore = 1;
1524  }
1525  nPackages = totals[pkgLevel];
1526 
1527  if (coreLevel >= 0) {
1528  __kmp_ncores = totals[coreLevel];
1529  nCoresPerPkg = maxCt[coreLevel];
1530  } else {
1531  __kmp_ncores = nPackages;
1532  nCoresPerPkg = 1;
1533  }
1534 
1535  // Check to see if the machine topology is uniform
1536  unsigned prod = maxCt[0];
1537  for (level = 1; level < depth; level++) {
1538  prod *= maxCt[level];
1539  }
1540  bool uniform = (prod == totals[level - 1]);
1541 
1542  // Print the machine topology summary.
1543  if (__kmp_affinity_verbose) {
1544  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1545  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1546 
1547  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1548  if (__kmp_affinity_respect_mask) {
1549  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1550  } else {
1551  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1552  }
1553  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1554  if (uniform) {
1555  KMP_INFORM(Uniform, "KMP_AFFINITY");
1556  } else {
1557  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1558  }
1559 
1560  kmp_str_buf_t buf;
1561  __kmp_str_buf_init(&buf);
1562 
1563  __kmp_str_buf_print(&buf, "%d", totals[0]);
1564  for (level = 1; level <= pkgLevel; level++) {
1565  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1566  }
1567  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1568  __kmp_nThreadsPerCore, __kmp_ncores);
1569 
1570  __kmp_str_buf_free(&buf);
1571  }
1572  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1573  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1574  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1575  for (proc = 0; (int)proc < nApics; ++proc) {
1576  __kmp_pu_os_idx[proc] = retval[proc].second;
1577  }
1578  if (__kmp_affinity_type == affinity_none) {
1579  __kmp_free(last);
1580  __kmp_free(maxCt);
1581  __kmp_free(counts);
1582  __kmp_free(totals);
1583  __kmp_free(retval);
1584  KMP_CPU_FREE(oldMask);
1585  return 0;
1586  }
1587 
1588  // Find any levels with radix 1, and remove them from the map
1589  // (except for the package level).
1590  int new_depth = 0;
1591  for (level = 0; level < depth; level++) {
1592  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1593  continue;
1594  }
1595  new_depth++;
1596  }
1597 
1598  // If we are removing any levels, allocate a new vector to return,
1599  // and copy the relevant information to it.
1600  if (new_depth != depth) {
1601  AddrUnsPair *new_retval =
1602  (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1603  for (proc = 0; (int)proc < nApics; proc++) {
1604  Address addr(new_depth);
1605  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1606  }
1607  int new_level = 0;
1608  int newPkgLevel = -1;
1609  int newCoreLevel = -1;
1610  int newThreadLevel = -1;
1611  int i;
1612  for (level = 0; level < depth; level++) {
1613  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1614  // Remove this level. Never remove the package level
1615  continue;
1616  }
1617  if (level == pkgLevel) {
1618  newPkgLevel = level;
1619  }
1620  if (level == coreLevel) {
1621  newCoreLevel = level;
1622  }
1623  if (level == threadLevel) {
1624  newThreadLevel = level;
1625  }
1626  for (proc = 0; (int)proc < nApics; proc++) {
1627  new_retval[proc].first.labels[new_level] =
1628  retval[proc].first.labels[level];
1629  }
1630  new_level++;
1631  }
1632 
1633  __kmp_free(retval);
1634  retval = new_retval;
1635  depth = new_depth;
1636  pkgLevel = newPkgLevel;
1637  coreLevel = newCoreLevel;
1638  threadLevel = newThreadLevel;
1639  }
1640 
1641  if (__kmp_affinity_gran_levels < 0) {
1642  // Set the granularity level based on what levels are modeled
1643  // in the machine topology map.
1644  __kmp_affinity_gran_levels = 0;
1645  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1646  __kmp_affinity_gran_levels++;
1647  }
1648  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1649  __kmp_affinity_gran_levels++;
1650  }
1651  if (__kmp_affinity_gran > affinity_gran_package) {
1652  __kmp_affinity_gran_levels++;
1653  }
1654  }
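// Illustrative note (not in the original source): __kmp_affinity_gran_levels
// counts how many of the finest levels are ignored when masks are formed
// later. For example, with a package/core/thread map and granularity=core,
// only the thread level is dropped, so the value is 1; granularity=package
// gives 2, and granularity=thread (fine) gives 0.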
1655 
1656  if (__kmp_affinity_verbose) {
1657  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1658  threadLevel);
1659  }
1660 
1661  __kmp_free(last);
1662  __kmp_free(maxCt);
1663  __kmp_free(counts);
1664  __kmp_free(totals);
1665  KMP_CPU_FREE(oldMask);
1666  *address2os = retval;
1667  return depth;
1668 }
1669 
1670 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1671 
1672 #define osIdIndex 0
1673 #define threadIdIndex 1
1674 #define coreIdIndex 2
1675 #define pkgIdIndex 3
1676 #define nodeIdIndex 4
1677 
1678 typedef unsigned *ProcCpuInfo;
1679 static unsigned maxIndex = pkgIdIndex;
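// Illustrative note (not in the original source): each parsed /proc/cpuinfo
// record is stored as an array of unsigned values indexed by the constants
// above, e.g. rec[osIdIndex] holds the "processor" field, rec[pkgIdIndex] the
// "physical id" field, and rec[nodeIdIndex + n] any "node_<n> id" fields.
// maxIndex tracks the highest index actually present in the file.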
1680 
1681 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) {
1682  const unsigned *aa = (const unsigned *)a;
1683  const unsigned *bb = (const unsigned *)b;
1684  if (aa[osIdIndex] < bb[osIdIndex])
1685  return -1;
1686  if (aa[osIdIndex] > bb[osIdIndex])
1687  return 1;
1688  return 0;
1689 }
1690 
1691 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1692  const void *b) {
1693  unsigned i;
1694  const unsigned *aa = *(unsigned *const *)a;
1695  const unsigned *bb = *(unsigned *const *)b;
1696  for (i = maxIndex;; i--) {
1697  if (aa[i] < bb[i])
1698  return -1;
1699  if (aa[i] > bb[i])
1700  return 1;
1701  if (i == osIdIndex)
1702  break;
1703  }
1704  return 0;
1705 }
1706 
1707 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1708 // affinity map.
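// For illustration only (not part of the build): a hypothetical fragment in
// the format this parser accepts. Unrecognized fields are ignored, and a
// blank line terminates each processor record.
//
//   processor   : 0
//   physical id : 0
//   core id     : 0
//   thread id   : 0     <- optional; auto-assigned below if missing
//   node_0 id   : 0     <- optional
//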
1709 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
1710  int *line,
1711  kmp_i18n_id_t *const msg_id,
1712  FILE *f) {
1713  *address2os = NULL;
1714  *msg_id = kmp_i18n_null;
1715 
1716  // Scan of the file, and count the number of "processor" (osId) fields,
1717  // and find the highest value of <n> for a node_<n> field.
1718  char buf[256];
1719  unsigned num_records = 0;
1720  while (!feof(f)) {
1721  buf[sizeof(buf) - 1] = 1;
1722  if (!fgets(buf, sizeof(buf), f)) {
1723  // Read errors presumably because of EOF
1724  break;
1725  }
1726 
1727  char s1[] = "processor";
1728  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1729  num_records++;
1730  continue;
1731  }
1732 
1733  // FIXME - this will match "node_<n> <garbage>"
1734  unsigned level;
1735  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1736  if (nodeIdIndex + level >= maxIndex) {
1737  maxIndex = nodeIdIndex + level;
1738  }
1739  continue;
1740  }
1741  }
1742 
1743  // Check for empty file / no valid processor records, or too many. The number
1744  // of records can't exceed the number of valid bits in the affinity mask.
1745  if (num_records == 0) {
1746  *line = 0;
1747  *msg_id = kmp_i18n_str_NoProcRecords;
1748  return -1;
1749  }
1750  if (num_records > (unsigned)__kmp_xproc) {
1751  *line = 0;
1752  *msg_id = kmp_i18n_str_TooManyProcRecords;
1753  return -1;
1754  }
1755 
1756  // Set the file pointer back to the beginning, so that we can scan the file
1757  // again, this time performing a full parse of the data. Allocate a vector of
1758  // ProcCpuInfo objects, where we will place the data. Adding an extra element
1759  // at the end allows us to remove a lot of extra checks for termination
1760  // conditions.
1761  if (fseek(f, 0, SEEK_SET) != 0) {
1762  *line = 0;
1763  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1764  return -1;
1765  }
1766 
1767  // Allocate the array of records to store the proc info in. The dummy
1768  // element at the end makes the logic in filling them out easier to code.
1769  unsigned **threadInfo =
1770  (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
1771  unsigned i;
1772  for (i = 0; i <= num_records; i++) {
1773  threadInfo[i] =
1774  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
1775  }
1776 
1777 #define CLEANUP_THREAD_INFO \
1778  for (i = 0; i <= num_records; i++) { \
1779  __kmp_free(threadInfo[i]); \
1780  } \
1781  __kmp_free(threadInfo);
1782 
1783  // A value of UINT_MAX means that we didn't find the field
1784  unsigned __index;
1785 
1786 #define INIT_PROC_INFO(p) \
1787  for (__index = 0; __index <= maxIndex; __index++) { \
1788  (p)[__index] = UINT_MAX; \
1789  }
1790 
1791  for (i = 0; i <= num_records; i++) {
1792  INIT_PROC_INFO(threadInfo[i]);
1793  }
1794 
1795  unsigned num_avail = 0;
1796  *line = 0;
1797  while (!feof(f)) {
1798  // Create an inner scoping level, so that all the goto targets at the end of
1799  // the loop appear in an outer scoping level. This avoids warnings about
1800  // jumping past an initialization to a target in the same block.
1801  {
1802  buf[sizeof(buf) - 1] = 1;
1803  bool long_line = false;
1804  if (!fgets(buf, sizeof(buf), f)) {
1805  // Read errors presumably because of EOF
1806  // If there is valid data in threadInfo[num_avail], then fake
1807  // a blank line to ensure that the last address gets parsed.
1808  bool valid = false;
1809  for (i = 0; i <= maxIndex; i++) {
1810  if (threadInfo[num_avail][i] != UINT_MAX) {
1811  valid = true;
1812  }
1813  }
1814  if (!valid) {
1815  break;
1816  }
1817  buf[0] = 0;
1818  } else if (!buf[sizeof(buf) - 1]) {
1819  // The line is longer than the buffer. Set a flag and don't
1820  // emit an error if we were going to ignore the line, anyway.
1821  long_line = true;
1822 
1823 #define CHECK_LINE \
1824  if (long_line) { \
1825  CLEANUP_THREAD_INFO; \
1826  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1827  return -1; \
1828  }
1829  }
1830  (*line)++;
1831 
1832  char s1[] = "processor";
1833  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1834  CHECK_LINE;
1835  char *p = strchr(buf + sizeof(s1) - 1, ':');
1836  unsigned val;
1837  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
1838  goto no_val;
1839  if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
1840  goto dup_field;
1841  threadInfo[num_avail][osIdIndex] = val;
1842 #if KMP_OS_LINUX && USE_SYSFS_INFO
1843  char path[256];
1844  KMP_SNPRINTF(
1845  path, sizeof(path),
1846  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
1847  threadInfo[num_avail][osIdIndex]);
1848  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
1849 
1850  KMP_SNPRINTF(path, sizeof(path),
1851  "/sys/devices/system/cpu/cpu%u/topology/core_id",
1852  threadInfo[num_avail][osIdIndex]);
1853  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
1854  continue;
1855 #else
1856  }
1857  char s2[] = "physical id";
1858  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1859  CHECK_LINE;
1860  char *p = strchr(buf + sizeof(s2) - 1, ':');
1861  unsigned val;
1862  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
1863  goto no_val;
1864  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
1865  goto dup_field;
1866  threadInfo[num_avail][pkgIdIndex] = val;
1867  continue;
1868  }
1869  char s3[] = "core id";
1870  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1871  CHECK_LINE;
1872  char *p = strchr(buf + sizeof(s3) - 1, ':');
1873  unsigned val;
1874  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
1875  goto no_val;
1876  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
1877  goto dup_field;
1878  threadInfo[num_avail][coreIdIndex] = val;
1879  continue;
1880 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
1881  }
1882  char s4[] = "thread id";
1883  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1884  CHECK_LINE;
1885  char *p = strchr(buf + sizeof(s4) - 1, ':');
1886  unsigned val;
1887  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
1888  goto no_val;
1889  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
1890  goto dup_field;
1891  threadInfo[num_avail][threadIdIndex] = val;
1892  continue;
1893  }
1894  unsigned level;
1895  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1896  CHECK_LINE;
1897  char *p = strchr(buf + sizeof(s4) - 1, ':');
1898  unsigned val;
1899  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
1900  goto no_val;
1901  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1902  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
1903  goto dup_field;
1904  threadInfo[num_avail][nodeIdIndex + level] = val;
1905  continue;
1906  }
1907 
1908  // We didn't recognize the leading token on the line. There are lots of
1909  // leading tokens that we don't recognize - if the line isn't empty, go on
1910  // to the next line.
1911  if ((*buf != 0) && (*buf != '\n')) {
1912  // If the line is longer than the buffer, read characters
1913  // until we find a newline.
1914  if (long_line) {
1915  int ch;
1916  while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
1917  ;
1918  }
1919  continue;
1920  }
1921 
1922  // A newline has signalled the end of the processor record.
1923  // Check that there aren't too many procs specified.
1924  if ((int)num_avail == __kmp_xproc) {
1925  CLEANUP_THREAD_INFO;
1926  *msg_id = kmp_i18n_str_TooManyEntries;
1927  return -1;
1928  }
1929 
1930  // Check for missing fields. The osId field must be there, and we
1931  // currently require that the physical id field is specified, also.
1932  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1933  CLEANUP_THREAD_INFO;
1934  *msg_id = kmp_i18n_str_MissingProcField;
1935  return -1;
1936  }
1937  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1938  CLEANUP_THREAD_INFO;
1939  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1940  return -1;
1941  }
1942 
1943  // Skip this proc if it is not included in the machine model.
1944  if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
1945  __kmp_affin_fullMask)) {
1946  INIT_PROC_INFO(threadInfo[num_avail]);
1947  continue;
1948  }
1949 
1950  // We have a successful parse of this proc's info.
1951  // Increment the counter, and prepare for the next proc.
1952  num_avail++;
1953  KMP_ASSERT(num_avail <= num_records);
1954  INIT_PROC_INFO(threadInfo[num_avail]);
1955  }
1956  continue;
1957 
1958  no_val:
1959  CLEANUP_THREAD_INFO;
1960  *msg_id = kmp_i18n_str_MissingValCpuinfo;
1961  return -1;
1962 
1963  dup_field:
1964  CLEANUP_THREAD_INFO;
1965  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
1966  return -1;
1967  }
1968  *line = 0;
1969 
1970 #if KMP_MIC && REDUCE_TEAM_SIZE
1971  unsigned teamSize = 0;
1972 #endif // KMP_MIC && REDUCE_TEAM_SIZE
1973 
1974  // check for num_records == __kmp_xproc ???
1975 
1976  // If there's only one thread context to bind to, form an Address object with
1977  // depth 1 and return immediately (or, if affinity is off, set address2os to
1978  // NULL and return).
1979  //
1980  // If it is configured to omit the package level when there is only a single
1981  // package, the logic at the end of this routine won't work if there is only a
1982  // single thread - it would try to form an Address object with depth 0.
1983  KMP_ASSERT(num_avail > 0);
1984  KMP_ASSERT(num_avail <= num_records);
1985  if (num_avail == 1) {
1986  __kmp_ncores = 1;
1987  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1988  if (__kmp_affinity_verbose) {
1989  if (!KMP_AFFINITY_CAPABLE()) {
1990  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
1991  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1992  KMP_INFORM(Uniform, "KMP_AFFINITY");
1993  } else {
1994  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1995  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
1996  __kmp_affin_fullMask);
1997  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
1998  if (__kmp_affinity_respect_mask) {
1999  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2000  } else {
2001  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2002  }
2003  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2004  KMP_INFORM(Uniform, "KMP_AFFINITY");
2005  }
2006  int index;
2007  kmp_str_buf_t buf;
2008  __kmp_str_buf_init(&buf);
2009  __kmp_str_buf_print(&buf, "1");
2010  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2011  __kmp_str_buf_print(&buf, " x 1");
2012  }
2013  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2014  __kmp_str_buf_free(&buf);
2015  }
2016 
2017  if (__kmp_affinity_type == affinity_none) {
2018  CLEANUP_THREAD_INFO;
2019  return 0;
2020  }
2021 
2022  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
2023  Address addr(1);
2024  addr.labels[0] = threadInfo[0][pkgIdIndex];
2025  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2026 
2027  if (__kmp_affinity_gran_levels < 0) {
2028  __kmp_affinity_gran_levels = 0;
2029  }
2030 
2031  if (__kmp_affinity_verbose) {
2032  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2033  }
2034 
2035  CLEANUP_THREAD_INFO;
2036  return 1;
2037  }
2038 
2039  // Sort the threadInfo table by physical Id.
2040  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2041  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2042 
2043  // The table is now sorted by pkgId / coreId / threadId, but we really don't
2044  // know the radix of any of the fields. pkgId's may be sparsely assigned among
2045  // the chips on a system. Although coreId's are usually assigned
2046  // [0 .. coresPerPkg-1] and threadId's are usually assigned
2047  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2048  //
2049  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2050  // total # packages) are at this point - we want to determine that now. We
2051  // only have an upper bound on the first two figures.
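// Illustrative note (not in the original source): on a hypothetical 2-socket
// system the "physical id" values might be {0, 3} rather than {0, 1}. The
// counts[] / maxCt[] / totals[] arrays below therefore count distinct values
// and per-parent repetitions instead of assuming dense numbering.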
2052  unsigned *counts =
2053  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2054  unsigned *maxCt =
2055  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2056  unsigned *totals =
2057  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2058  unsigned *lastId =
2059  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2060 
2061  bool assign_thread_ids = false;
2062  unsigned threadIdCt;
2063  unsigned index;
2064 
2065 restart_radix_check:
2066  threadIdCt = 0;
2067 
2068  // Initialize the counter arrays with data from threadInfo[0].
2069  if (assign_thread_ids) {
2070  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2071  threadInfo[0][threadIdIndex] = threadIdCt++;
2072  } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2073  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2074  }
2075  }
2076  for (index = 0; index <= maxIndex; index++) {
2077  counts[index] = 1;
2078  maxCt[index] = 1;
2079  totals[index] = 1;
2080  lastId[index] = threadInfo[0][index];
2081  ;
2082  }
2083 
2084  // Run through the rest of the OS procs.
2085  for (i = 1; i < num_avail; i++) {
2086  // Find the most significant index whose id differs from the id for the
2087  // previous OS proc.
2088  for (index = maxIndex; index >= threadIdIndex; index--) {
2089  if (assign_thread_ids && (index == threadIdIndex)) {
2090  // Auto-assign the thread id field if it wasn't specified.
2091  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2092  threadInfo[i][threadIdIndex] = threadIdCt++;
2093  }
2094  // Apparently the thread id field was specified for some entries and not
2095  // others. Start the thread id counter off at the next higher thread id.
2096  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2097  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2098  }
2099  }
2100  if (threadInfo[i][index] != lastId[index]) {
2101  // Run through all indices which are less significant, and reset the
2102  // counts to 1. At all levels up to and including index, we need to
2103  // increment the totals and record the last id.
2104  unsigned index2;
2105  for (index2 = threadIdIndex; index2 < index; index2++) {
2106  totals[index2]++;
2107  if (counts[index2] > maxCt[index2]) {
2108  maxCt[index2] = counts[index2];
2109  }
2110  counts[index2] = 1;
2111  lastId[index2] = threadInfo[i][index2];
2112  }
2113  counts[index]++;
2114  totals[index]++;
2115  lastId[index] = threadInfo[i][index];
2116 
2117  if (assign_thread_ids && (index > threadIdIndex)) {
2118 
2119 #if KMP_MIC && REDUCE_TEAM_SIZE
2120  // The default team size is the total #threads in the machine
2121  // minus 1 thread for every core that has 3 or more threads.
2122  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2123 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2124 
2125  // Restart the thread counter, as we are on a new core.
2126  threadIdCt = 0;
2127 
2128  // Auto-assign the thread id field if it wasn't specified.
2129  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2130  threadInfo[i][threadIdIndex] = threadIdCt++;
2131  }
2132 
2133  // Apparently the thread id field was specified for some entries and
2134  // not others. Start the thread id counter off at the next higher
2135  // thread id.
2136  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2137  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2138  }
2139  }
2140  break;
2141  }
2142  }
2143  if (index < threadIdIndex) {
2144  // If thread ids were specified, it is an error if they are not unique.
2145  // Also, check that we haven't already restarted the loop (to be safe -
2146  // shouldn't need to).
2147  if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2148  __kmp_free(lastId);
2149  __kmp_free(totals);
2150  __kmp_free(maxCt);
2151  __kmp_free(counts);
2152  CLEANUP_THREAD_INFO;
2153  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2154  return -1;
2155  }
2156 
2157  // If the thread ids were not specified and we see entries that
2158  // are duplicates, start the loop over and assign the thread ids manually.
2159  assign_thread_ids = true;
2160  goto restart_radix_check;
2161  }
2162  }
2163 
2164 #if KMP_MIC && REDUCE_TEAM_SIZE
2165  // The default team size is the total #threads in the machine
2166  // minus 1 thread for every core that has 3 or more threads.
2167  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2168 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2169 
2170  for (index = threadIdIndex; index <= maxIndex; index++) {
2171  if (counts[index] > maxCt[index]) {
2172  maxCt[index] = counts[index];
2173  }
2174  }
2175 
2176  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2177  nCoresPerPkg = maxCt[coreIdIndex];
2178  nPackages = totals[pkgIdIndex];
2179 
2180  // Check to see if the machine topology is uniform
2181  unsigned prod = totals[maxIndex];
2182  for (index = threadIdIndex; index < maxIndex; index++) {
2183  prod *= maxCt[index];
2184  }
2185  bool uniform = (prod == totals[threadIdIndex]);
2186 
2187  // When affinity is off, this routine will still be called to set
2188  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2189  // Make sure all these vars are set correctly, and return now if affinity is
2190  // not enabled.
2191  __kmp_ncores = totals[coreIdIndex];
2192 
2193  if (__kmp_affinity_verbose) {
2194  if (!KMP_AFFINITY_CAPABLE()) {
2195  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2196  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2197  if (uniform) {
2198  KMP_INFORM(Uniform, "KMP_AFFINITY");
2199  } else {
2200  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2201  }
2202  } else {
2203  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2204  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2205  __kmp_affin_fullMask);
2206  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2207  if (__kmp_affinity_respect_mask) {
2208  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2209  } else {
2210  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2211  }
2212  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2213  if (uniform) {
2214  KMP_INFORM(Uniform, "KMP_AFFINITY");
2215  } else {
2216  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2217  }
2218  }
2219  kmp_str_buf_t buf;
2220  __kmp_str_buf_init(&buf);
2221 
2222  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2223  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2224  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2225  }
2226  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2227  maxCt[threadIdIndex], __kmp_ncores);
2228 
2229  __kmp_str_buf_free(&buf);
2230  }
2231 
2232 #if KMP_MIC && REDUCE_TEAM_SIZE
2233  // Set the default team size.
2234  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2235  __kmp_dflt_team_nth = teamSize;
2236  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2237  "__kmp_dflt_team_nth = %d\n",
2238  __kmp_dflt_team_nth));
2239  }
2240 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2241 
2242  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2243  KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
2244  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2245  for (i = 0; i < num_avail; ++i) { // fill the os indices
2246  __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2247  }
2248 
2249  if (__kmp_affinity_type == affinity_none) {
2250  __kmp_free(lastId);
2251  __kmp_free(totals);
2252  __kmp_free(maxCt);
2253  __kmp_free(counts);
2254  CLEANUP_THREAD_INFO;
2255  return 0;
2256  }
2257 
2258  // Count the number of levels which have more nodes at that level than at the
2259  // parent's level (with an implicit root node above the top level).
2260  // This is equivalent to saying that there is at least one node at this level
2261  // which has a sibling. These levels are in the map, and the package level is
2262  // always in the map.
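// Illustrative note (not in the original source): on a hypothetical
// single-socket machine with 4 cores and 1 thread per core,
// totals[threadIdIndex] == totals[coreIdIndex] == 4 and totals[pkgIdIndex]
// == 1, so the thread level is omitted from the map (no core has a sibling
// thread) while the core and package levels are kept, giving depth == 2.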
2263  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2264  int level = 0;
2265  for (index = threadIdIndex; index < maxIndex; index++) {
2266  KMP_ASSERT(totals[index] >= totals[index + 1]);
2267  inMap[index] = (totals[index] > totals[index + 1]);
2268  }
2269  inMap[maxIndex] = (totals[maxIndex] > 1);
2270  inMap[pkgIdIndex] = true;
2271 
2272  int depth = 0;
2273  for (index = threadIdIndex; index <= maxIndex; index++) {
2274  if (inMap[index]) {
2275  depth++;
2276  }
2277  }
2278  KMP_ASSERT(depth > 0);
2279 
2280  // Construct the data structure that is to be returned.
2281  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2282  int pkgLevel = -1;
2283  int coreLevel = -1;
2284  int threadLevel = -1;
2285 
2286  for (i = 0; i < num_avail; ++i) {
2287  Address addr(depth);
2288  unsigned os = threadInfo[i][osIdIndex];
2289  int src_index;
2290  int dst_index = 0;
2291 
2292  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2293  if (!inMap[src_index]) {
2294  continue;
2295  }
2296  addr.labels[dst_index] = threadInfo[i][src_index];
2297  if (src_index == pkgIdIndex) {
2298  pkgLevel = dst_index;
2299  } else if (src_index == coreIdIndex) {
2300  coreLevel = dst_index;
2301  } else if (src_index == threadIdIndex) {
2302  threadLevel = dst_index;
2303  }
2304  dst_index++;
2305  }
2306  (*address2os)[i] = AddrUnsPair(addr, os);
2307  }
2308 
2309  if (__kmp_affinity_gran_levels < 0) {
2310  // Set the granularity level based on what levels are modeled
2311  // in the machine topology map.
2312  unsigned src_index;
2313  __kmp_affinity_gran_levels = 0;
2314  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2315  if (!inMap[src_index]) {
2316  continue;
2317  }
2318  switch (src_index) {
2319  case threadIdIndex:
2320  if (__kmp_affinity_gran > affinity_gran_thread) {
2321  __kmp_affinity_gran_levels++;
2322  }
2323 
2324  break;
2325  case coreIdIndex:
2326  if (__kmp_affinity_gran > affinity_gran_core) {
2327  __kmp_affinity_gran_levels++;
2328  }
2329  break;
2330 
2331  case pkgIdIndex:
2332  if (__kmp_affinity_gran > affinity_gran_package) {
2333  __kmp_affinity_gran_levels++;
2334  }
2335  break;
2336  }
2337  }
2338  }
2339 
2340  if (__kmp_affinity_verbose) {
2341  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2342  coreLevel, threadLevel);
2343  }
2344 
2345  __kmp_free(inMap);
2346  __kmp_free(lastId);
2347  __kmp_free(totals);
2348  __kmp_free(maxCt);
2349  __kmp_free(counts);
2350  CLEANUP_THREAD_INFO;
2351  return depth;
2352 }
2353 
2354 // Create and return a table of affinity masks, indexed by OS thread ID.
2355 // This routine handles OR'ing together all the affinity masks of threads
2356 // that are sufficiently close, if granularity > fine.
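// Illustrative note (not in the original source): with granularity=core on a
// hypothetical machine that has 2 hardware threads per core, OS procs 0 and 1
// share a core, so the table entries for both would hold the same mask {0,1};
// with granularity=fine each entry would hold only its own bit.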
2357 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2358  unsigned *numUnique,
2359  AddrUnsPair *address2os,
2360  unsigned numAddrs) {
2361  // First form a table of affinity masks in order of OS thread id.
2362  unsigned depth;
2363  unsigned maxOsId;
2364  unsigned i;
2365 
2366  KMP_ASSERT(numAddrs > 0);
2367  depth = address2os[0].first.depth;
2368 
2369  maxOsId = 0;
2370  for (i = 0; i < numAddrs; i++) {
2371  unsigned osId = address2os[i].second;
2372  if (osId > maxOsId) {
2373  maxOsId = osId;
2374  }
2375  }
2376  kmp_affin_mask_t *osId2Mask;
2377  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2378 
2379  // Sort the address2os table according to physical order. Doing so will put
2380  // all threads on the same core/package/node in consecutive locations.
2381  qsort(address2os, numAddrs, sizeof(*address2os),
2382  __kmp_affinity_cmp_Address_labels);
2383 
2384  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2385  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2386  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2387  }
2388  if (__kmp_affinity_gran_levels >= (int)depth) {
2389  if (__kmp_affinity_verbose ||
2390  (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2391  KMP_WARNING(AffThreadsMayMigrate);
2392  }
2393  }
2394 
2395  // Run through the table, forming the masks for all threads on each core.
2396  // Threads on the same core will have identical "Address" objects, not
2397  // considering the last level, which must be the thread id. All threads on a
2398  // core will appear consecutively.
2399  unsigned unique = 0;
2400  unsigned j = 0; // index of 1st thread on core
2401  unsigned leader = 0;
2402  Address *leaderAddr = &(address2os[0].first);
2403  kmp_affin_mask_t *sum;
2404  KMP_CPU_ALLOC_ON_STACK(sum);
2405  KMP_CPU_ZERO(sum);
2406  KMP_CPU_SET(address2os[0].second, sum);
2407  for (i = 1; i < numAddrs; i++) {
2408  // If this thread is sufficiently close to the leader (within the
2409  // granularity setting), then set the bit for this os thread in the
2410  // affinity mask for this group, and go on to the next thread.
2411  if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
2412  KMP_CPU_SET(address2os[i].second, sum);
2413  continue;
2414  }
2415 
2416  // For every thread in this group, copy the mask to the thread's entry in
2417  // the osId2Mask table. Mark the first address as a leader.
2418  for (; j < i; j++) {
2419  unsigned osId = address2os[j].second;
2420  KMP_DEBUG_ASSERT(osId <= maxOsId);
2421  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2422  KMP_CPU_COPY(mask, sum);
2423  address2os[j].first.leader = (j == leader);
2424  }
2425  unique++;
2426 
2427  // Start a new mask.
2428  leader = i;
2429  leaderAddr = &(address2os[i].first);
2430  KMP_CPU_ZERO(sum);
2431  KMP_CPU_SET(address2os[i].second, sum);
2432  }
2433 
2434  // For every thread in last group, copy the mask to the thread's
2435  // entry in the osId2Mask table.
2436  for (; j < i; j++) {
2437  unsigned osId = address2os[j].second;
2438  KMP_DEBUG_ASSERT(osId <= maxOsId);
2439  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2440  KMP_CPU_COPY(mask, sum);
2441  address2os[j].first.leader = (j == leader);
2442  }
2443  unique++;
2444  KMP_CPU_FREE_FROM_STACK(sum);
2445 
2446  *maxIndex = maxOsId;
2447  *numUnique = unique;
2448  return osId2Mask;
2449 }
2450 
2451 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2452 // as file-static than to try and pass them through the calling sequence of
2453 // the recursive-descent OMP_PLACES parser.
2454 static kmp_affin_mask_t *newMasks;
2455 static int numNewMasks;
2456 static int nextNewMask;
2457 
2458 #define ADD_MASK(_mask) \
2459  { \
2460  if (nextNewMask >= numNewMasks) { \
2461  int i; \
2462  numNewMasks *= 2; \
2463  kmp_affin_mask_t *temp; \
2464  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
2465  for (i = 0; i < numNewMasks / 2; i++) { \
2466  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
2467  kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
2468  KMP_CPU_COPY(dest, src); \
2469  } \
2470  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
2471  newMasks = temp; \
2472  } \
2473  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2474  nextNewMask++; \
2475  }
2476 
2477 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
2478  { \
2479  if (((_osId) > _maxOsId) || \
2480  (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2481  if (__kmp_affinity_verbose || \
2482  (__kmp_affinity_warnings && \
2483  (__kmp_affinity_type != affinity_none))) { \
2484  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2485  } \
2486  } else { \
2487  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2488  } \
2489  }
2490 
2491 // Re-parse the proclist (for the explicit affinity type), and form the list
2492 // of affinity newMasks indexed by gtid.
2493 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2494  unsigned int *out_numMasks,
2495  const char *proclist,
2496  kmp_affin_mask_t *osId2Mask,
2497  int maxOsId) {
2498  int i;
2499  const char *scan = proclist;
2500  const char *next = proclist;
2501 
2502  // Allocate a temporary vector of masks (newMasks); the ADD_MASK macro
2503  // doubles its size whenever it fills up.
2504  numNewMasks = 2;
2505  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2506  nextNewMask = 0;
2507  kmp_affin_mask_t *sumMask;
2508  KMP_CPU_ALLOC(sumMask);
2509  int setSize = 0;
2510 
2511  for (;;) {
2512  int start, end, stride;
2513 
2514  SKIP_WS(scan);
2515  next = scan;
2516  if (*next == '\0') {
2517  break;
2518  }
2519 
2520  if (*next == '{') {
2521  int num;
2522  setSize = 0;
2523  next++; // skip '{'
2524  SKIP_WS(next);
2525  scan = next;
2526 
2527  // Read the first integer in the set.
2528  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2529  SKIP_DIGITS(next);
2530  num = __kmp_str_to_int(scan, *next);
2531  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2532 
2533  // Copy the mask for that osId to the sum (union) mask.
2534  if ((num > maxOsId) ||
2535  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2536  if (__kmp_affinity_verbose ||
2537  (__kmp_affinity_warnings &&
2538  (__kmp_affinity_type != affinity_none))) {
2539  KMP_WARNING(AffIgnoreInvalidProcID, num);
2540  }
2541  KMP_CPU_ZERO(sumMask);
2542  } else {
2543  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2544  setSize = 1;
2545  }
2546 
2547  for (;;) {
2548  // Check for end of set.
2549  SKIP_WS(next);
2550  if (*next == '}') {
2551  next++; // skip '}'
2552  break;
2553  }
2554 
2555  // Skip optional comma.
2556  if (*next == ',') {
2557  next++;
2558  }
2559  SKIP_WS(next);
2560 
2561  // Read the next integer in the set.
2562  scan = next;
2563  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2564 
2565  SKIP_DIGITS(next);
2566  num = __kmp_str_to_int(scan, *next);
2567  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2568 
2569  // Add the mask for that osId to the sum mask.
2570  if ((num > maxOsId) ||
2571  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2572  if (__kmp_affinity_verbose ||
2573  (__kmp_affinity_warnings &&
2574  (__kmp_affinity_type != affinity_none))) {
2575  KMP_WARNING(AffIgnoreInvalidProcID, num);
2576  }
2577  } else {
2578  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2579  setSize++;
2580  }
2581  }
2582  if (setSize > 0) {
2583  ADD_MASK(sumMask);
2584  }
2585 
2586  SKIP_WS(next);
2587  if (*next == ',') {
2588  next++;
2589  }
2590  scan = next;
2591  continue;
2592  }
2593 
2594  // Read the first integer.
2595  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2596  SKIP_DIGITS(next);
2597  start = __kmp_str_to_int(scan, *next);
2598  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2599  SKIP_WS(next);
2600 
2601  // If this isn't a range, then add a mask to the list and go on.
2602  if (*next != '-') {
2603  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2604 
2605  // Skip optional comma.
2606  if (*next == ',') {
2607  next++;
2608  }
2609  scan = next;
2610  continue;
2611  }
2612 
2613  // This is a range. Skip over the '-' and read in the 2nd int.
2614  next++; // skip '-'
2615  SKIP_WS(next);
2616  scan = next;
2617  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2618  SKIP_DIGITS(next);
2619  end = __kmp_str_to_int(scan, *next);
2620  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2621 
2622  // Check for a stride parameter
2623  stride = 1;
2624  SKIP_WS(next);
2625  if (*next == ':') {
2626  // A stride is specified. Skip over the ':' and read the 3rd int.
2627  int sign = +1;
2628  next++; // skip ':'
2629  SKIP_WS(next);
2630  scan = next;
2631  if (*next == '-') {
2632  sign = -1;
2633  next++;
2634  SKIP_WS(next);
2635  scan = next;
2636  }
2637  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2638  SKIP_DIGITS(next);
2639  stride = __kmp_str_to_int(scan, *next);
2640  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2641  stride *= sign;
2642  }
2643 
2644  // Do some range checks.
2645  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2646  if (stride > 0) {
2647  KMP_ASSERT2(start <= end, "bad explicit proc list");
2648  } else {
2649  KMP_ASSERT2(start >= end, "bad explicit proc list");
2650  }
2651  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2652 
2653  // Add the mask for each OS proc # to the list.
2654  if (stride > 0) {
2655  do {
2656  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2657  start += stride;
2658  } while (start <= end);
2659  } else {
2660  do {
2661  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2662  start += stride;
2663  } while (start >= end);
2664  }
2665 
2666  // Skip optional comma.
2667  SKIP_WS(next);
2668  if (*next == ',') {
2669  next++;
2670  }
2671  scan = next;
2672  }
2673 
2674  *out_numMasks = nextNewMask;
2675  if (nextNewMask == 0) {
2676  *out_masks = NULL;
2677  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2678  return;
2679  }
2680  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
2681  for (i = 0; i < nextNewMask; i++) {
2682  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
2683  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
2684  KMP_CPU_COPY(dest, src);
2685  }
2686  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2687  KMP_CPU_FREE(sumMask);
2688 }
2689 
2690 #if OMP_40_ENABLED
2691 
2692 /*-----------------------------------------------------------------------------
2693 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2694 places. Again, here is the grammar:
2695 
2696 place_list := place
2697 place_list := place , place_list
2698 place := num
2699 place := place : num
2700 place := place : num : signed
2701 place := { subplace_list }
2702 place := ! place // (lowest priority)
2703 subplace_list := subplace
2704 subplace_list := subplace , subplace_list
2705 subplace := num
2706 subplace := num : num
2707 subplace := num : num : signed
2708 signed := num
2709 signed := + signed
2710 signed := - signed
2711 -----------------------------------------------------------------------------*/
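// Illustrative example (not in the original source), assuming OS proc ids 0-7
// are all valid: the place list "{0:4}:2:4" denotes the subplace {0,1,2,3}
// repeated twice with a stride of 4, i.e. the places {0,1,2,3} and {4,5,6,7}.
// A leading '!' complements a place within 0..maxOsId, so "!{0}" selects all
// proc ids in that range except 0.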
2712 
2713 static void __kmp_process_subplace_list(const char **scan,
2714  kmp_affin_mask_t *osId2Mask,
2715  int maxOsId, kmp_affin_mask_t *tempMask,
2716  int *setSize) {
2717  const char *next;
2718 
2719  for (;;) {
2720  int start, count, stride, i;
2721 
2722  // Read in the starting proc id
2723  SKIP_WS(*scan);
2724  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
2725  next = *scan;
2726  SKIP_DIGITS(next);
2727  start = __kmp_str_to_int(*scan, *next);
2728  KMP_ASSERT(start >= 0);
2729  *scan = next;
2730 
2731  // valid follow sets are ',' ':' and '}'
2732  SKIP_WS(*scan);
2733  if (**scan == '}' || **scan == ',') {
2734  if ((start > maxOsId) ||
2735  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2736  if (__kmp_affinity_verbose ||
2737  (__kmp_affinity_warnings &&
2738  (__kmp_affinity_type != affinity_none))) {
2739  KMP_WARNING(AffIgnoreInvalidProcID, start);
2740  }
2741  } else {
2742  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2743  (*setSize)++;
2744  }
2745  if (**scan == '}') {
2746  break;
2747  }
2748  (*scan)++; // skip ','
2749  continue;
2750  }
2751  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2752  (*scan)++; // skip ':'
2753 
2754  // Read count parameter
2755  SKIP_WS(*scan);
2756  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
2757  next = *scan;
2758  SKIP_DIGITS(next);
2759  count = __kmp_str_to_int(*scan, *next);
2760  KMP_ASSERT(count >= 0);
2761  *scan = next;
2762 
2763  // valid follow sets are ',' ':' and '}'
2764  SKIP_WS(*scan);
2765  if (**scan == '}' || **scan == ',') {
2766  for (i = 0; i < count; i++) {
2767  if ((start > maxOsId) ||
2768  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2769  if (__kmp_affinity_verbose ||
2770  (__kmp_affinity_warnings &&
2771  (__kmp_affinity_type != affinity_none))) {
2772  KMP_WARNING(AffIgnoreInvalidProcID, start);
2773  }
2774  break; // don't proliferate warnings for large count
2775  } else {
2776  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2777  start++;
2778  (*setSize)++;
2779  }
2780  }
2781  if (**scan == '}') {
2782  break;
2783  }
2784  (*scan)++; // skip ','
2785  continue;
2786  }
2787  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2788  (*scan)++; // skip ':'
2789 
2790  // Read stride parameter
2791  int sign = +1;
2792  for (;;) {
2793  SKIP_WS(*scan);
2794  if (**scan == '+') {
2795  (*scan)++; // skip '+'
2796  continue;
2797  }
2798  if (**scan == '-') {
2799  sign *= -1;
2800  (*scan)++; // skip '-'
2801  continue;
2802  }
2803  break;
2804  }
2805  SKIP_WS(*scan);
2806  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
2807  next = *scan;
2808  SKIP_DIGITS(next);
2809  stride = __kmp_str_to_int(*scan, *next);
2810  KMP_ASSERT(stride >= 0);
2811  *scan = next;
2812  stride *= sign;
2813 
2814  // valid follow sets are ',' and '}'
2815  SKIP_WS(*scan);
2816  if (**scan == '}' || **scan == ',') {
2817  for (i = 0; i < count; i++) {
2818  if ((start > maxOsId) ||
2819  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2820  if (__kmp_affinity_verbose ||
2821  (__kmp_affinity_warnings &&
2822  (__kmp_affinity_type != affinity_none))) {
2823  KMP_WARNING(AffIgnoreInvalidProcID, start);
2824  }
2825  break; // don't proliferate warnings for large count
2826  } else {
2827  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2828  start += stride;
2829  (*setSize)++;
2830  }
2831  }
2832  if (**scan == '}') {
2833  break;
2834  }
2835  (*scan)++; // skip ','
2836  continue;
2837  }
2838 
2839  KMP_ASSERT2(0, "bad explicit places list");
2840  }
2841 }
2842 
2843 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2844  int maxOsId, kmp_affin_mask_t *tempMask,
2845  int *setSize) {
2846  const char *next;
2847 
2848  // valid follow sets are '{' '!' and num
2849  SKIP_WS(*scan);
2850  if (**scan == '{') {
2851  (*scan)++; // skip '{'
2852  __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
2853  KMP_ASSERT2(**scan == '}', "bad explicit places list");
2854  (*scan)++; // skip '}'
2855  } else if (**scan == '!') {
2856  (*scan)++; // skip '!'
2857  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
2858  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
2859  } else if ((**scan >= '0') && (**scan <= '9')) {
2860  next = *scan;
2861  SKIP_DIGITS(next);
2862  int num = __kmp_str_to_int(*scan, *next);
2863  KMP_ASSERT(num >= 0);
2864  if ((num > maxOsId) ||
2865  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2866  if (__kmp_affinity_verbose ||
2867  (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2868  KMP_WARNING(AffIgnoreInvalidProcID, num);
2869  }
2870  } else {
2871  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
2872  (*setSize)++;
2873  }
2874  *scan = next; // skip num
2875  } else {
2876  KMP_ASSERT2(0, "bad explicit places list");
2877  }
2878 }
2879 
2880 // static void
2881 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
2882  unsigned int *out_numMasks,
2883  const char *placelist,
2884  kmp_affin_mask_t *osId2Mask,
2885  int maxOsId) {
2886  int i, j, count, stride, sign;
2887  const char *scan = placelist;
2888  const char *next = placelist;
2889 
2890  numNewMasks = 2;
2891  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2892  nextNewMask = 0;
2893 
2894  // tempMask is modified based on the previous or initial
2895  // place to form the current place
2896  // previousMask contains the previous place
2897  kmp_affin_mask_t *tempMask;
2898  kmp_affin_mask_t *previousMask;
2899  KMP_CPU_ALLOC(tempMask);
2900  KMP_CPU_ZERO(tempMask);
2901  KMP_CPU_ALLOC(previousMask);
2902  KMP_CPU_ZERO(previousMask);
2903  int setSize = 0;
2904 
2905  for (;;) {
2906  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
2907 
2908  // valid follow sets are ',' ':' and EOL
2909  SKIP_WS(scan);
2910  if (*scan == '\0' || *scan == ',') {
2911  if (setSize > 0) {
2912  ADD_MASK(tempMask);
2913  }
2914  KMP_CPU_ZERO(tempMask);
2915  setSize = 0;
2916  if (*scan == '\0') {
2917  break;
2918  }
2919  scan++; // skip ','
2920  continue;
2921  }
2922 
2923  KMP_ASSERT2(*scan == ':', "bad explicit places list");
2924  scan++; // skip ':'
2925 
2926  // Read count parameter
2927  SKIP_WS(scan);
2928  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
2929  next = scan;
2930  SKIP_DIGITS(next);
2931  count = __kmp_str_to_int(scan, *next);
2932  KMP_ASSERT(count >= 0);
2933  scan = next;
2934 
2935  // valid follow sets are ',' ':' and EOL
2936  SKIP_WS(scan);
2937  if (*scan == '\0' || *scan == ',') {
2938  stride = +1;
2939  } else {
2940  KMP_ASSERT2(*scan == ':', "bad explicit places list");
2941  scan++; // skip ':'
2942 
2943  // Read stride parameter
2944  sign = +1;
2945  for (;;) {
2946  SKIP_WS(scan);
2947  if (*scan == '+') {
2948  scan++; // skip '+'
2949  continue;
2950  }
2951  if (*scan == '-') {
2952  sign *= -1;
2953  scan++; // skip '-'
2954  continue;
2955  }
2956  break;
2957  }
2958  SKIP_WS(scan);
2959  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
2960  next = scan;
2961  SKIP_DIGITS(next);
2962  stride = __kmp_str_to_int(scan, *next);
2963  KMP_DEBUG_ASSERT(stride >= 0);
2964  scan = next;
2965  stride *= sign;
2966  }
2967 
2968  // Add places determined by initial_place : count : stride
2969  for (i = 0; i < count; i++) {
2970  if (setSize == 0) {
2971  break;
2972  }
2973  // Add the current place, then build the next place (tempMask) from that
2974  KMP_CPU_COPY(previousMask, tempMask);
2975  ADD_MASK(previousMask);
2976  KMP_CPU_ZERO(tempMask);
2977  setSize = 0;
2978  KMP_CPU_SET_ITERATE(j, previousMask) {
2979  if (!KMP_CPU_ISSET(j, previousMask)) {
2980  continue;
2981  }
2982  if ((j + stride > maxOsId) || (j + stride < 0) ||
2983  (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
2984  (!KMP_CPU_ISSET(j + stride,
2985  KMP_CPU_INDEX(osId2Mask, j + stride)))) {
2986  if ((__kmp_affinity_verbose ||
2987  (__kmp_affinity_warnings &&
2988  (__kmp_affinity_type != affinity_none))) &&
2989  i < count - 1) {
2990  KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
2991  }
2992  continue;
2993  }
2994  KMP_CPU_SET(j + stride, tempMask);
2995  setSize++;
2996  }
2997  }
2998  KMP_CPU_ZERO(tempMask);
2999  setSize = 0;
3000 
3001  // valid follow sets are ',' and EOL
3002  SKIP_WS(scan);
3003  if (*scan == '\0') {
3004  break;
3005  }
3006  if (*scan == ',') {
3007  scan++; // skip ','
3008  continue;
3009  }
3010 
3011  KMP_ASSERT2(0, "bad explicit places list");
3012  }
3013 
3014  *out_numMasks = nextNewMask;
3015  if (nextNewMask == 0) {
3016  *out_masks = NULL;
3017  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3018  return;
3019  }
3020  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3021  KMP_CPU_FREE(tempMask);
3022  KMP_CPU_FREE(previousMask);
3023  for (i = 0; i < nextNewMask; i++) {
3024  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3025  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3026  KMP_CPU_COPY(dest, src);
3027  }
3028  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3029 }
3030 
3031 #endif /* OMP_40_ENABLED */
3032 
3033 #undef ADD_MASK
3034 #undef ADD_MASK_OSID
3035 
3036 #if KMP_USE_HWLOC
3037 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
3038  hwloc_obj_type_t type,
3039  hwloc_obj_t *f) {
3040  if (!hwloc_compare_types(o->type, type)) {
3041  if (*f == NULL)
3042  *f = o; // output first descendant found
3043  return 1;
3044  }
3045  int sum = 0;
3046  for (unsigned i = 0; i < o->arity; i++)
3047  sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
3048  return sum; // will be 0 if none were found (as PU arity is 0)
3049 }
3050 
3051 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
3052  hwloc_obj_t o, unsigned depth,
3053  hwloc_obj_t *f) {
3054  if (o->depth == depth) {
3055  if (*f == NULL)
3056  *f = o; // output first descendant found
3057  return 1;
3058  }
3059  int sum = 0;
3060  for (unsigned i = 0; i < o->arity; i++)
3061  sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
3062  return sum; // will be 0 if none were found (as PU arity is 0)
3063 }
3064 
3065 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
3066  // skip PUs descendants of the object o
3067  int skipped = 0;
3068  hwloc_obj_t hT = NULL;
3069  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3070  for (int i = 0; i < N; ++i) {
3071  KMP_DEBUG_ASSERT(hT);
3072  unsigned idx = hT->os_index;
3073  if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3074  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3075  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3076  ++skipped;
3077  }
3078  hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3079  }
3080  return skipped; // count number of skipped units
3081 }
3082 
3083 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3084  // check if obj has PUs present in fullMask
3085  hwloc_obj_t hT = NULL;
3086  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3087  for (int i = 0; i < N; ++i) {
3088  KMP_DEBUG_ASSERT(hT);
3089  unsigned idx = hT->os_index;
3090  if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3091  return 1; // found PU
3092  hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3093  }
3094  return 0; // no PUs found
3095 }
3096 #endif // KMP_USE_HWLOC
3097 
3098 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3099  AddrUnsPair *newAddr;
3100  if (__kmp_hws_requested == 0)
3101  goto _exit; // no topology limiting actions requested, exit
3102 #if KMP_USE_HWLOC
3103  if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3104  // The number of subobjects is calculated dynamically; this works fine for
3105  // any non-uniform topology.
3106  // L2 cache objects are located by depth, other objects by type.
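// Illustrative note (not in the original source): a hypothetical
// KMP_HW_SUBSET request for one socket, four cores per socket and two
// threads per core would arrive here as __kmp_hws_socket.num == 1,
// __kmp_hws_core.num == 4 and __kmp_hws_proc.num == 2 (plus any offsets);
// the loops below walk the hwloc tree and clear the PUs of everything else
// from __kmp_affin_fullMask.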
3107  hwloc_topology_t tp = __kmp_hwloc_topology;
3108  int nS = 0, nN = 0, nL = 0, nC = 0,
3109  nT = 0; // logical index including skipped
3110  int nCr = 0, nTr = 0; // number of requested units
3111  int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3112  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3113  int L2depth, idx;
3114 
3115  // check support of extensions ----------------------------------
3116  int numa_support = 0, tile_support = 0;
3117  if (__kmp_pu_os_idx)
3118  hT = hwloc_get_pu_obj_by_os_index(tp,
3119  __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3120  else
3121  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3122  if (hT == NULL) { // something's gone wrong
3123  KMP_WARNING(AffHWSubsetUnsupported);
3124  goto _exit;
3125  }
3126  // check NUMA node
3127  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3128  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3129  if (hN != NULL && hN->depth > hS->depth) {
3130  numa_support = 1; // 1 in case socket includes node(s)
3131  } else if (__kmp_hws_node.num > 0) {
3132  // don't support sockets inside NUMA node (no such HW found for testing)
3133  KMP_WARNING(AffHWSubsetUnsupported);
3134  goto _exit;
3135  }
3136  // check L2 cache; get the object by depth because there may be multiple caches
3137  L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3138  hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3139  if (hL != NULL &&
3140  __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3141  tile_support = 1; // no point counting L2 if it contains a single core
3142  } else if (__kmp_hws_tile.num > 0) {
3143  if (__kmp_hws_core.num == 0) {
3144  __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3145  __kmp_hws_tile.num = 0;
3146  } else {
3147  // L2 and core are both requested, but represent same object
3148  KMP_WARNING(AffHWSubsetInvalid);
3149  goto _exit;
3150  }
3151  }
3152  // end of check of extensions -----------------------------------
3153 
3154  // fill in unset items, validate settings -----------------------
3155  if (__kmp_hws_socket.num == 0)
3156  __kmp_hws_socket.num = nPackages; // use all available sockets
3157  if (__kmp_hws_socket.offset >= nPackages) {
3158  KMP_WARNING(AffHWSubsetManySockets);
3159  goto _exit;
3160  }
3161  if (numa_support) {
3162  int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3163  &hN); // num nodes in socket
3164  if (__kmp_hws_node.num == 0)
3165  __kmp_hws_node.num = NN; // use all available nodes
3166  if (__kmp_hws_node.offset >= NN) {
3167  KMP_WARNING(AffHWSubsetManyNodes);
3168  goto _exit;
3169  }
3170  if (tile_support) {
3171  // get num tiles in node
3172  int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3173  if (__kmp_hws_tile.num == 0) {
3174  __kmp_hws_tile.num = NL + 1;
3175  } // use all available tiles, some node may have more tiles, thus +1
3176  if (__kmp_hws_tile.offset >= NL) {
3177  KMP_WARNING(AffHWSubsetManyTiles);
3178  goto _exit;
3179  }
3180  int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3181  &hC); // num cores in tile
3182  if (__kmp_hws_core.num == 0)
3183  __kmp_hws_core.num = NC; // use all available cores
3184  if (__kmp_hws_core.offset >= NC) {
3185  KMP_WARNING(AffHWSubsetManyCores);
3186  goto _exit;
3187  }
3188  } else { // tile_support
3189  int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
3190  &hC); // num cores in node
3191  if (__kmp_hws_core.num == 0)
3192  __kmp_hws_core.num = NC; // use all available cores
3193  if (__kmp_hws_core.offset >= NC) {
3194  KMP_WARNING(AffHWSubsetManyCores);
3195  goto _exit;
3196  }
3197  } // tile_support
3198  } else { // numa_support
3199  if (tile_support) {
3200  // get num tiles in socket
3201  int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3202  if (__kmp_hws_tile.num == 0)
3203  __kmp_hws_tile.num = NL; // use all available tiles
3204  if (__kmp_hws_tile.offset >= NL) {
3205  KMP_WARNING(AffHWSubsetManyTiles);
3206  goto _exit;
3207  }
3208  int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3209  &hC); // num cores in tile
3210  if (__kmp_hws_core.num == 0)
3211  __kmp_hws_core.num = NC; // use all available cores
3212  if (__kmp_hws_core.offset >= NC) {
3213  KMP_WARNING(AffHWSubsetManyCores);
3214  goto _exit;
3215  }
3216  } else { // tile_support
3217  int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
3218  &hC); // num cores in socket
3219  if (__kmp_hws_core.num == 0)
3220  __kmp_hws_core.num = NC; // use all available cores
3221  if (__kmp_hws_core.offset >= NC) {
3222  KMP_WARNING(AffHWSubsetManyCores);
3223  goto _exit;
3224  }
3225  } // tile_support
3226  }
3227  if (__kmp_hws_proc.num == 0)
3228  __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
3229  if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
3230  KMP_WARNING(AffHWSubsetManyProcs);
3231  goto _exit;
3232  }
3233  // end of validation --------------------------------------------
3234 
3235  if (pAddr) // pAddr is NULL in case of affinity_none
3236  newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
3237  __kmp_avail_proc); // max size
3238  // main loop to form HW subset ----------------------------------
3239  hS = NULL;
3240  int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
3241  for (int s = 0; s < NP; ++s) {
3242  // Check Socket -----------------------------------------------
3243  hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
3244  if (!__kmp_hwloc_obj_has_PUs(tp, hS))
3245  continue; // skip socket if all PUs are out of fullMask
3246  ++nS; // only count objects that have PUs in the affinity mask
3247  if (nS <= __kmp_hws_socket.offset ||
3248  nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
3249  n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
3250  continue; // move to next socket
3251  }
3252  nCr = 0; // count number of cores per socket
3253  // socket requested, go down the topology tree
3254  // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3255  if (numa_support) {
3256  nN = 0;
3257  hN = NULL;
3258  // num nodes in current socket
3259  int NN =
3260  __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
3261  for (int n = 0; n < NN; ++n) {
3262  // Check NUMA Node ----------------------------------------
3263  if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
3264  hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3265  continue; // skip node if all PUs are out of fullMask
3266  }
3267  ++nN;
3268  if (nN <= __kmp_hws_node.offset ||
3269  nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
3270  // skip node as not requested
3271  n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
3272  hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3273  continue; // move to next node
3274  }
3275  // node requested, go down the topology tree
3276  if (tile_support) {
3277  nL = 0;
3278  hL = NULL;
3279  int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3280  for (int l = 0; l < NL; ++l) {
3281  // Check L2 (tile) ------------------------------------
3282  if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3283  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3284  continue; // skip tile if all PUs are out of fullMask
3285  }
3286  ++nL;
3287  if (nL <= __kmp_hws_tile.offset ||
3288  nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3289  // skip tile as not requested
3290  n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3291  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3292  continue; // move to next tile
3293  }
3294  // tile requested, go down the topology tree
3295  nC = 0;
3296  hC = NULL;
3297  // num cores in current tile
3298  int NC = __kmp_hwloc_count_children_by_type(tp, hL,
3299  HWLOC_OBJ_CORE, &hC);
3300  for (int c = 0; c < NC; ++c) {
3301  // Check Core ---------------------------------------
3302  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3303  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3304  continue; // skip core if all PUs are out of fullMask
3305  }
3306  ++nC;
3307  if (nC <= __kmp_hws_core.offset ||
3308  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3309  // skip core as not requested
3310  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3311  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3312  continue; // move to next core
3313  }
3314  // core requested, go down to PUs
3315  nT = 0;
3316  nTr = 0;
3317  hT = NULL;
3318  // num procs in current core
3319  int NT = __kmp_hwloc_count_children_by_type(tp, hC,
3320  HWLOC_OBJ_PU, &hT);
3321  for (int t = 0; t < NT; ++t) {
3322  // Check PU ---------------------------------------
3323  idx = hT->os_index;
3324  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3325  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3326  continue; // skip PU if not in fullMask
3327  }
3328  ++nT;
3329  if (nT <= __kmp_hws_proc.offset ||
3330  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3331  // skip PU
3332  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3333  ++n_old;
3334  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3335  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3336  continue; // move to next PU
3337  }
3338  ++nTr;
3339  if (pAddr) // collect requested thread's data
3340  newAddr[n_new] = (*pAddr)[n_old];
3341  ++n_new;
3342  ++n_old;
3343  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3344  } // threads loop
3345  if (nTr > 0) {
3346  ++nCr; // num cores per socket
3347  ++nCo; // total num cores
3348  if (nTr > nTpC)
3349  nTpC = nTr; // calc max threads per core
3350  }
3351  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3352  } // cores loop
3353  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3354  } // tiles loop
3355  } else { // tile_support
3356  // no tiles, check cores
3357  nC = 0;
3358  hC = NULL;
3359  // num cores in current node
3360  int NC =
3361  __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
3362  for (int c = 0; c < NC; ++c) {
3363  // Check Core ---------------------------------------
3364  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3365  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3366  continue; // skip core if all PUs are out of fullMask
3367  }
3368  ++nC;
3369  if (nC <= __kmp_hws_core.offset ||
3370  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3371  // skip core as not requested
3372  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3373  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3374  continue; // move to next core
3375  }
3376  // core requested, go down to PUs
3377  nT = 0;
3378  nTr = 0;
3379  hT = NULL;
3380  int NT =
3381  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3382  for (int t = 0; t < NT; ++t) {
3383  // Check PU ---------------------------------------
3384  idx = hT->os_index;
3385  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3386  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3387  continue; // skip PU if not in fullMask
3388  }
3389  ++nT;
3390  if (nT <= __kmp_hws_proc.offset ||
3391  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3392  // skip PU
3393  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3394  ++n_old;
3395  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3396  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3397  continue; // move to next PU
3398  }
3399  ++nTr;
3400  if (pAddr) // collect requested thread's data
3401  newAddr[n_new] = (*pAddr)[n_old];
3402  ++n_new;
3403  ++n_old;
3404  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3405  } // threads loop
3406  if (nTr > 0) {
3407  ++nCr; // num cores per socket
3408  ++nCo; // total num cores
3409  if (nTr > nTpC)
3410  nTpC = nTr; // calc max threads per core
3411  }
3412  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3413  } // cores loop
3414  } // tile_support
3415  hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3416  } // nodes loop
3417  } else { // numa_support
3418  // no NUMA support
3419  if (tile_support) {
3420  nL = 0;
3421  hL = NULL;
3422  // num tiles in current socket
3423  int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3424  for (int l = 0; l < NL; ++l) {
3425  // Check L2 (tile) ------------------------------------
3426  if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3427  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3428  continue; // skip tile if all PUs are out of fullMask
3429  }
3430  ++nL;
3431  if (nL <= __kmp_hws_tile.offset ||
3432  nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3433  // skip tile as not requested
3434  n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3435  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3436  continue; // move to next tile
3437  }
3438  // tile requested, go down the topology tree
3439  nC = 0;
3440  hC = NULL;
3441  // num cores per tile
3442  int NC =
3443  __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
3444  for (int c = 0; c < NC; ++c) {
3445  // Check Core ---------------------------------------
3446  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3447  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3448  continue; // skip core if all PUs are out of fullMask
3449  }
3450  ++nC;
3451  if (nC <= __kmp_hws_core.offset ||
3452  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3453  // skip core as not requested
3454  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3455  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3456  continue; // move to next core
3457  }
3458  // core requested, go down to PUs
3459  nT = 0;
3460  nTr = 0;
3461  hT = NULL;
3462  // num procs per core
3463  int NT =
3464  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3465  for (int t = 0; t < NT; ++t) {
3466  // Check PU ---------------------------------------
3467  idx = hT->os_index;
3468  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3469  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3470  continue; // skip PU if not in fullMask
3471  }
3472  ++nT;
3473  if (nT <= __kmp_hws_proc.offset ||
3474  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3475  // skip PU
3476  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3477  ++n_old;
3478  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3479  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3480  continue; // move to next PU
3481  }
3482  ++nTr;
3483  if (pAddr) // collect requested thread's data
3484  newAddr[n_new] = (*pAddr)[n_old];
3485  ++n_new;
3486  ++n_old;
3487  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3488  } // threads loop
3489  if (nTr > 0) {
3490  ++nCr; // num cores per socket
3491  ++nCo; // total num cores
3492  if (nTr > nTpC)
3493  nTpC = nTr; // calc max threads per core
3494  }
3495  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3496  } // cores loop
3497  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3498  } // tiles loop
3499  } else { // tile_support
3500  // no tiles, check cores
3501  nC = 0;
3502  hC = NULL;
3503  // num cores in socket
3504  int NC =
3505  __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
3506  for (int c = 0; c < NC; ++c) {
3507  // Check Core -------------------------------------------
3508  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3509  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3510  continue; // skip core if all PUs are out of fullMask
3511  }
3512  ++nC;
3513  if (nC <= __kmp_hws_core.offset ||
3514  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3515  // skip core as not requested
3516  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3517  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3518  continue; // move to next core
3519  }
3520  // core requested, go down to PUs
3521  nT = 0;
3522  nTr = 0;
3523  hT = NULL;
3524  // num procs per core
3525  int NT =
3526  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3527  for (int t = 0; t < NT; ++t) {
3528  // Check PU ---------------------------------------
3529  idx = hT->os_index;
3530  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3531  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3532  continue; // skip PU if not in fullMask
3533  }
3534  ++nT;
3535  if (nT <= __kmp_hws_proc.offset ||
3536  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3537  // skip PU
3538  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3539  ++n_old;
3540  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3541  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3542  continue; // move to next PU
3543  }
3544  ++nTr;
3545  if (pAddr) // collect requested thread's data
3546  newAddr[n_new] = (*pAddr)[n_old];
3547  ++n_new;
3548  ++n_old;
3549  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3550  } // threads loop
3551  if (nTr > 0) {
3552  ++nCr; // num cores per socket
3553  ++nCo; // total num cores
3554  if (nTr > nTpC)
3555  nTpC = nTr; // calc max threads per core
3556  }
3557  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3558  } // cores loop
3559  } // tile_support
3560  } // numa_support
3561  if (nCr > 0) { // found cores?
3562  ++nPkg; // num sockets
3563  if (nCr > nCpP)
3564  nCpP = nCr; // calc max cores per socket
3565  }
3566  } // sockets loop
3567 
3568  // check that the subset is valid
3569  KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
3570  KMP_DEBUG_ASSERT(nPkg > 0);
3571  KMP_DEBUG_ASSERT(nCpP > 0);
3572  KMP_DEBUG_ASSERT(nTpC > 0);
3573  KMP_DEBUG_ASSERT(nCo > 0);
3574  KMP_DEBUG_ASSERT(nPkg <= nPackages);
3575  KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
3576  KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
3577  KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
3578 
3579  nPackages = nPkg; // correct num sockets
3580  nCoresPerPkg = nCpP; // correct num cores per socket
3581  __kmp_nThreadsPerCore = nTpC; // correct num threads per core
3582  __kmp_avail_proc = n_new; // correct num procs
3583  __kmp_ncores = nCo; // correct num cores
3584  // hwloc topology method end
3585  } else
3586 #endif // KMP_USE_HWLOC
3587  {
3588  int n_old = 0, n_new = 0, proc_num = 0;
3589  if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
3590  KMP_WARNING(AffHWSubsetNoHWLOC);
3591  goto _exit;
3592  }
3593  if (__kmp_hws_socket.num == 0)
3594  __kmp_hws_socket.num = nPackages; // use all available sockets
3595  if (__kmp_hws_core.num == 0)
3596  __kmp_hws_core.num = nCoresPerPkg; // use all available cores
3597  if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
3598  __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
3599  if (!__kmp_affinity_uniform_topology()) {
3600  KMP_WARNING(AffHWSubsetNonUniform);
3601  goto _exit; // don't support non-uniform topology
3602  }
3603  if (depth > 3) {
3604  KMP_WARNING(AffHWSubsetNonThreeLevel);
3605  goto _exit; // don't support non-3-level topologies
3606  }
3607  if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
3608  KMP_WARNING(AffHWSubsetManySockets);
3609  goto _exit;
3610  }
3611  if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
3612  KMP_WARNING(AffHWSubsetManyCores);
3613  goto _exit;
3614  }
3615  // Form the requested subset
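  // The non-hwloc topology is a uniform three-level socket x core x thread
  // grid (enforced by the checks above), so address2os and __kmp_pu_os_idx
  // are walked in that canonical order below: entries inside the requested
  // socket/core/proc windows are copied into newAddr, all other PUs are
  // cleared from __kmp_affin_fullMask.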
3616  if (pAddr) // pAddr is NULL in case of affinity_none
3617  newAddr = (AddrUnsPair *)__kmp_allocate(
3618  sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
3619  __kmp_hws_proc.num);
3620  for (int i = 0; i < nPackages; ++i) {
3621  if (i < __kmp_hws_socket.offset ||
3622  i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
3623  // skip not-requested socket
3624  n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
3625  if (__kmp_pu_os_idx != NULL) {
3626  // walk through skipped socket
3627  for (int j = 0; j < nCoresPerPkg; ++j) {
3628  for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3629  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3630  ++proc_num;
3631  }
3632  }
3633  }
3634  } else {
3635  // walk through requested socket
3636  for (int j = 0; j < nCoresPerPkg; ++j) {
3637  if (j < __kmp_hws_core.offset ||
3638  j >= __kmp_hws_core.offset +
3639  __kmp_hws_core.num) { // skip not-requested core
3640  n_old += __kmp_nThreadsPerCore;
3641  if (__kmp_pu_os_idx != NULL) {
3642  for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3643  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3644  ++proc_num;
3645  }
3646  }
3647  } else {
3648  // walk through requested core
3649  for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3650  if (k < __kmp_hws_proc.num) {
3651  if (pAddr) // collect requested thread's data
3652  newAddr[n_new] = (*pAddr)[n_old];
3653  n_new++;
3654  } else {
3655  if (__kmp_pu_os_idx != NULL)
3656  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3657  }
3658  n_old++;
3659  ++proc_num;
3660  }
3661  }
3662  }
3663  }
3664  }
3665  KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3666  KMP_DEBUG_ASSERT(n_new ==
3667  __kmp_hws_socket.num * __kmp_hws_core.num *
3668  __kmp_hws_proc.num);
3669  nPackages = __kmp_hws_socket.num; // correct nPackages
3670  nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
3671  __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
3672  __kmp_avail_proc = n_new; // correct avail_proc
3673  __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
3674  } // non-hwloc topology method
3675  if (pAddr) {
3676  __kmp_free(*pAddr);
3677  *pAddr = newAddr; // replace old topology with new one
3678  }
3679  if (__kmp_affinity_verbose) {
3680  char m[KMP_AFFIN_MASK_PRINT_LEN];
3681  __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
3682  __kmp_affin_fullMask);
3683  if (__kmp_affinity_respect_mask) {
3684  KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
3685  } else {
3686  KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
3687  }
3688  KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
3689  kmp_str_buf_t buf;
3690  __kmp_str_buf_init(&buf);
3691  __kmp_str_buf_print(&buf, "%d", nPackages);
3692  KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
3693  __kmp_nThreadsPerCore, __kmp_ncores);
3694  __kmp_str_buf_free(&buf);
3695  }
3696 _exit:
3697  if (__kmp_pu_os_idx != NULL) {
3698  __kmp_free(__kmp_pu_os_idx);
3699  __kmp_pu_os_idx = NULL;
3700  }
3701 }
3702 
3703 // This function figures out the deepest level at which there is at least one
3704 // cluster/core with more than one processing unit bound to it.
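// E.g., for a three-level (package, core, thread) topology with
// bottom_level == 2, any address with a nonzero thread label raises
// core_level to 1, making "core" the granularity used by the balanced
// affinity code below.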
3705 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
3706  int nprocs, int bottom_level) {
3707  int core_level = 0;
3708 
3709  for (int i = 0; i < nprocs; i++) {
3710  for (int j = bottom_level; j > 0; j--) {
3711  if (address2os[i].first.labels[j] > 0) {
3712  if (core_level < (j - 1)) {
3713  core_level = j - 1;
3714  }
3715  }
3716  }
3717  }
3718  return core_level;
3719 }
3720 
3721 // This function counts the number of clusters/cores at the given level.
3722 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
3723  int nprocs, int bottom_level,
3724  int core_level) {
3725  int ncores = 0;
3726  int i, j;
3727 
3728  j = bottom_level;
3729  for (i = 0; i < nprocs; i++) {
3730  for (j = bottom_level; j > core_level; j--) {
3731  if ((i + 1) < nprocs) {
3732  if (address2os[i + 1].first.labels[j] > 0) {
3733  break;
3734  }
3735  }
3736  }
3737  if (j == core_level) {
3738  ncores++;
3739  }
3740  }
3741  if (j > core_level) {
3742  // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss one
3743  // core. May occur when called from __kmp_affinity_find_core().
3744  ncores++;
3745  }
3746  return ncores;
3747 }
3748 
3749 // This function finds the cluster/core to which the given processing unit is bound.
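// It counts the cores among the first proc + 1 entries of address2os; the
// result is the zero-based index of the core that owns entry 'proc'.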
3750 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
3751  int bottom_level, int core_level) {
3752  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
3753  core_level) -
3754  1;
3755 }
3756 
3757 // This function finds the maximal number of processing units bound to a
3758 // cluster/core at the given level.
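// The code relies on labels[core_level + 1] being the zero-based index of a
// PU within its core/cluster, so max(label) + 1 over all entries is the
// widest core. If the core level is already the bottom level, every core has
// exactly one PU.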
3759 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
3760  int nprocs, int bottom_level,
3761  int core_level) {
3762  int maxprocpercore = 0;
3763 
3764  if (core_level < bottom_level) {
3765  for (int i = 0; i < nprocs; i++) {
3766  int percore = address2os[i].first.labels[core_level + 1] + 1;
3767 
3768  if (percore > maxprocpercore) {
3769  maxprocpercore = percore;
3770  }
3771  }
3772  } else {
3773  maxprocpercore = 1;
3774  }
3775  return maxprocpercore;
3776 }
3777 
3778 static AddrUnsPair *address2os = NULL;
3779 static int *procarr = NULL;
3780 static int __kmp_aff_depth = 0;
3781 
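// Used when a topology method returns depth == 0, i.e. the affinity type is
// "none": bail out of __kmp_aux_affinity_initialize without building an
// address2os table.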
3782 #define KMP_EXIT_AFF_NONE \
3783  KMP_ASSERT(__kmp_affinity_type == affinity_none); \
3784  KMP_ASSERT(address2os == NULL); \
3785  __kmp_apply_thread_places(NULL, 0); \
3786  return;
3787 
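// qsort comparator used at sortAddresses below: it orders addresses by their
// childNums, comparing the __kmp_affinity_compact innermost levels first
// (bottom-up) and then the remaining levels top-down.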
3788 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
3789  const Address *aa = &(((const AddrUnsPair *)a)->first);
3790  const Address *bb = &(((const AddrUnsPair *)b)->first);
3791  unsigned depth = aa->depth;
3792  unsigned i;
3793  KMP_DEBUG_ASSERT(depth == bb->depth);
3794  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
3795  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
3796  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
3797  int j = depth - i - 1;
3798  if (aa->childNums[j] < bb->childNums[j])
3799  return -1;
3800  if (aa->childNums[j] > bb->childNums[j])
3801  return 1;
3802  }
3803  for (; i < depth; i++) {
3804  int j = i - __kmp_affinity_compact;
3805  if (aa->childNums[j] < bb->childNums[j])
3806  return -1;
3807  if (aa->childNums[j] > bb->childNums[j])
3808  return 1;
3809  }
3810  return 0;
3811 }
3812 
3813 static void __kmp_aux_affinity_initialize(void) {
3814  if (__kmp_affinity_masks != NULL) {
3815  KMP_ASSERT(__kmp_affin_fullMask != NULL);
3816  return;
3817  }
3818 
3819  // Create the "full" mask - this defines all of the processors that we
3820  // consider to be in the machine model. If respect is set, then it is the
3821  // initialization thread's affinity mask. Otherwise, it is all processors that
3822  // we know about on the machine.
3823  if (__kmp_affin_fullMask == NULL) {
3824  KMP_CPU_ALLOC(__kmp_affin_fullMask);
3825  }
3826  if (KMP_AFFINITY_CAPABLE()) {
3827  if (__kmp_affinity_respect_mask) {
3828  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3829 
3830  // Count the number of available processors.
3831  unsigned i;
3832  __kmp_avail_proc = 0;
3833  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3834  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3835  continue;
3836  }
3837  __kmp_avail_proc++;
3838  }
3839  if (__kmp_avail_proc > __kmp_xproc) {
3840  if (__kmp_affinity_verbose ||
3841  (__kmp_affinity_warnings &&
3842  (__kmp_affinity_type != affinity_none))) {
3843  KMP_WARNING(ErrorInitializeAffinity);
3844  }
3845  __kmp_affinity_type = affinity_none;
3846  KMP_AFFINITY_DISABLE();
3847  return;
3848  }
3849  } else {
3850  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3851  __kmp_avail_proc = __kmp_xproc;
3852  }
3853  }
3854 
3855  int depth = -1;
3856  kmp_i18n_id_t msg_id = kmp_i18n_null;
3857 
3858  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3859  // KMP_TOPOLOGY_METHOD=cpuinfo
3860  if ((__kmp_cpuinfo_file != NULL) &&
3861  (__kmp_affinity_top_method == affinity_top_method_all)) {
3862  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3863  }
3864 
3865  if (__kmp_affinity_top_method == affinity_top_method_all) {
3866  // In the default code path, errors are not fatal - we just try using
3867  // another method. We only emit a warning message if affinity is on or the
3868  // verbose flag is set, and the nowarnings flag was not set.
3869  const char *file_name = NULL;
3870  int line = 0;
3871 #if KMP_USE_HWLOC
3872  if (depth < 0 &&
3873  __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3874  if (__kmp_affinity_verbose) {
3875  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
3876  }
3877  if (!__kmp_hwloc_error) {
3878  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
3879  if (depth == 0) {
3880  KMP_EXIT_AFF_NONE;
3881  } else if (depth < 0 && __kmp_affinity_verbose) {
3882  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3883  }
3884  } else if (__kmp_affinity_verbose) {
3885  KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3886  }
3887  }
3888 #endif
3889 
3890 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3891 
3892  if (depth < 0) {
3893  if (__kmp_affinity_verbose) {
3894  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3895  }
3896 
3897  file_name = NULL;
3898  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3899  if (depth == 0) {
3900  KMP_EXIT_AFF_NONE;
3901  }
3902 
3903  if (depth < 0) {
3904  if (__kmp_affinity_verbose) {
3905  if (msg_id != kmp_i18n_null) {
3906  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
3907  __kmp_i18n_catgets(msg_id),
3908  KMP_I18N_STR(DecodingLegacyAPIC));
3909  } else {
3910  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3911  KMP_I18N_STR(DecodingLegacyAPIC));
3912  }
3913  }
3914 
3915  file_name = NULL;
3916  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3917  if (depth == 0) {
3918  KMP_EXIT_AFF_NONE;
3919  }
3920  }
3921  }
3922 
3923 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3924 
3925 #if KMP_OS_LINUX
3926 
3927  if (depth < 0) {
3928  if (__kmp_affinity_verbose) {
3929  if (msg_id != kmp_i18n_null) {
3930  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
3931  __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3932  } else {
3933  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3934  }
3935  }
3936 
3937  FILE *f = fopen("/proc/cpuinfo", "r");
3938  if (f == NULL) {
3939  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3940  } else {
3941  file_name = "/proc/cpuinfo";
3942  depth =
3943  __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3944  fclose(f);
3945  if (depth == 0) {
3946  KMP_EXIT_AFF_NONE;
3947  }
3948  }
3949  }
3950 
3951 #endif /* KMP_OS_LINUX */
3952 
3953 #if KMP_GROUP_AFFINITY
3954 
3955  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3956  if (__kmp_affinity_verbose) {
3957  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3958  }
3959 
3960  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3961  KMP_ASSERT(depth != 0);
3962  }
3963 
3964 #endif /* KMP_GROUP_AFFINITY */
3965 
3966  if (depth < 0) {
3967  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3968  if (file_name == NULL) {
3969  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3970  } else if (line == 0) {
3971  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3972  } else {
3973  KMP_INFORM(UsingFlatOSFileLine, file_name, line,
3974  __kmp_i18n_catgets(msg_id));
3975  }
3976  }
3977  // FIXME - print msg if msg_id = kmp_i18n_null ???
3978 
3979  file_name = "";
3980  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3981  if (depth == 0) {
3982  KMP_EXIT_AFF_NONE;
3983  }
3984  KMP_ASSERT(depth > 0);
3985  KMP_ASSERT(address2os != NULL);
3986  }
3987  }
3988 
3989 // If the user has specified that a particular topology discovery method is to be
3990 // used, then we abort if that method fails. The exception is group affinity,
3991 // which might have been implicitly set.
3992 
3993 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3994 
3995  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3996  if (__kmp_affinity_verbose) {
3997  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3998  }
3999 
4000  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4001  if (depth == 0) {
4002  KMP_EXIT_AFF_NONE;
4003  }
4004  if (depth < 0) {
4005  KMP_ASSERT(msg_id != kmp_i18n_null);
4006  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4007  }
4008  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4009  if (__kmp_affinity_verbose) {
4010  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
4011  }
4012 
4013  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4014  if (depth == 0) {
4015  KMP_EXIT_AFF_NONE;
4016  }
4017  if (depth < 0) {
4018  KMP_ASSERT(msg_id != kmp_i18n_null);
4019  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4020  }
4021  }
4022 
4023 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4024 
4025  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4026  const char *filename;
4027  if (__kmp_cpuinfo_file != NULL) {
4028  filename = __kmp_cpuinfo_file;
4029  } else {
4030  filename = "/proc/cpuinfo";
4031  }
4032 
4033  if (__kmp_affinity_verbose) {
4034  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
4035  }
4036 
4037  FILE *f = fopen(filename, "r");
4038  if (f == NULL) {
4039  int code = errno;
4040  if (__kmp_cpuinfo_file != NULL) {
4041  __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
4042  KMP_ERR(code), KMP_HNT(NameComesFrom_CPUINFO_FILE),
4043  __kmp_msg_null);
4044  } else {
4045  __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
4046  KMP_ERR(code), __kmp_msg_null);
4047  }
4048  }
4049  int line = 0;
4050  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4051  fclose(f);
4052  if (depth < 0) {
4053  KMP_ASSERT(msg_id != kmp_i18n_null);
4054  if (line > 0) {
4055  KMP_FATAL(FileLineMsgExiting, filename, line,
4056  __kmp_i18n_catgets(msg_id));
4057  } else {
4058  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4059  }
4060  }
4061  if (__kmp_affinity_type == affinity_none) {
4062  KMP_ASSERT(depth == 0);
4063  KMP_EXIT_AFF_NONE;
4064  }
4065  }
4066 
4067 #if KMP_GROUP_AFFINITY
4068 
4069  else if (__kmp_affinity_top_method == affinity_top_method_group) {
4070  if (__kmp_affinity_verbose) {
4071  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4072  }
4073 
4074  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4075  KMP_ASSERT(depth != 0);
4076  if (depth < 0) {
4077  KMP_ASSERT(msg_id != kmp_i18n_null);
4078  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4079  }
4080  }
4081 
4082 #endif /* KMP_GROUP_AFFINITY */
4083 
4084  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4085  if (__kmp_affinity_verbose) {
4086  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
4087  }
4088 
4089  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4090  if (depth == 0) {
4091  KMP_EXIT_AFF_NONE;
4092  }
4093  // should not fail
4094  KMP_ASSERT(depth > 0);
4095  KMP_ASSERT(address2os != NULL);
4096  }
4097 
4098 #if KMP_USE_HWLOC
4099  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4100  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4101  if (__kmp_affinity_verbose) {
4102  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4103  }
4104  depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4105  if (depth == 0) {
4106  KMP_EXIT_AFF_NONE;
4107  }
4108  }
4109 #endif // KMP_USE_HWLOC
4110 
4111  if (address2os == NULL) {
4112  if (KMP_AFFINITY_CAPABLE() &&
4113  (__kmp_affinity_verbose ||
4114  (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
4115  KMP_WARNING(ErrorInitializeAffinity);
4116  }
4117  __kmp_affinity_type = affinity_none;
4118  KMP_AFFINITY_DISABLE();
4119  return;
4120  }
4121 
4122  __kmp_apply_thread_places(&address2os, depth);
4123 
4124  // Create the table of masks, indexed by thread Id.
4125  unsigned maxIndex;
4126  unsigned numUnique;
4127  kmp_affin_mask_t *osId2Mask =
4128  __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
4129  if (__kmp_affinity_gran_levels == 0) {
4130  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4131  }
4132 
4133  // Set the childNums vector in all Address objects. This must be done before
4134  // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
4135  // account the setting of __kmp_affinity_compact.
4136  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
4137 
4138  switch (__kmp_affinity_type) {
4139 
4140  case affinity_explicit:
4141  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
4142 #if OMP_40_ENABLED
4143  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4144 #endif
4145  {
4146  __kmp_affinity_process_proclist(
4147  &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4148  __kmp_affinity_proclist, osId2Mask, maxIndex);
4149  }
4150 #if OMP_40_ENABLED
4151  else {
4152  __kmp_affinity_process_placelist(
4153  &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4154  __kmp_affinity_proclist, osId2Mask, maxIndex);
4155  }
4156 #endif
4157  if (__kmp_affinity_num_masks == 0) {
4158  if (__kmp_affinity_verbose ||
4159  (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
4160  KMP_WARNING(AffNoValidProcID);
4161  }
4162  __kmp_affinity_type = affinity_none;
4163  return;
4164  }
4165  break;
4166 
4167  // The other affinity types rely on sorting the Addresses according to some
4168  // permutation of the machine topology tree. Set __kmp_affinity_compact and
4169  // __kmp_affinity_offset appropriately, then jump to a common code fragment
4170  // to do the sort and create the array of affinity masks.
4171 
4172  case affinity_logical:
4173  __kmp_affinity_compact = 0;
4174  if (__kmp_affinity_offset) {
4175  __kmp_affinity_offset =
4176  __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4177  }
4178  goto sortAddresses;
4179 
4180  case affinity_physical:
4181  if (__kmp_nThreadsPerCore > 1) {
4182  __kmp_affinity_compact = 1;
4183  if (__kmp_affinity_compact >= depth) {
4184  __kmp_affinity_compact = 0;
4185  }
4186  } else {
4187  __kmp_affinity_compact = 0;
4188  }
4189  if (__kmp_affinity_offset) {
4190  __kmp_affinity_offset =
4191  __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4192  }
4193  goto sortAddresses;
4194 
4195  case affinity_scatter:
4196  if (__kmp_affinity_compact >= depth) {
4197  __kmp_affinity_compact = 0;
4198  } else {
4199  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4200  }
4201  goto sortAddresses;
4202 
4203  case affinity_compact:
4204  if (__kmp_affinity_compact >= depth) {
4205  __kmp_affinity_compact = depth - 1;
4206  }
4207  goto sortAddresses;
4208 
4209  case affinity_balanced:
4210  if (depth <= 1) {
4211  if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4212  KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4213  }
4214  __kmp_affinity_type = affinity_none;
4215  return;
4216  } else if (__kmp_affinity_uniform_topology()) {
4217  break;
4218  } else { // Non-uniform topology
4219 
4220  // Save the depth for further usage
4221  __kmp_aff_depth = depth;
4222 
4223  int core_level = __kmp_affinity_find_core_level(
4224  address2os, __kmp_avail_proc, depth - 1);
4225  int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
4226  depth - 1, core_level);
4227  int maxprocpercore = __kmp_affinity_max_proc_per_core(
4228  address2os, __kmp_avail_proc, depth - 1, core_level);
4229 
4230  int nproc = ncores * maxprocpercore;
4231  if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4232  if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4233  KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4234  }
4235  __kmp_affinity_type = affinity_none;
4236  return;
4237  }
4238 
4239  procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4240  for (int i = 0; i < nproc; i++) {
4241  procarr[i] = -1;
4242  }
4243 
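  // procarr is an ncores x maxprocpercore table: slot
  // [core * maxprocpercore + k] holds the OS proc id of the k-th available
  // PU on that core, and -1 marks unused slots on narrower cores.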
4244  int lastcore = -1;
4245  int inlastcore = 0;
4246  for (int i = 0; i < __kmp_avail_proc; i++) {
4247  int proc = address2os[i].second;
4248  int core =
4249  __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4250 
4251  if (core == lastcore) {
4252  inlastcore++;
4253  } else {
4254  inlastcore = 0;
4255  }
4256  lastcore = core;
4257 
4258  procarr[core * maxprocpercore + inlastcore] = proc;
4259  }
4260 
4261  break;
4262  }
4263 
4264  sortAddresses:
4265  // Allocate the gtid->affinity mask table.
4266  if (__kmp_affinity_dups) {
4267  __kmp_affinity_num_masks = __kmp_avail_proc;
4268  } else {
4269  __kmp_affinity_num_masks = numUnique;
4270  }
4271 
4272 #if OMP_40_ENABLED
4273  if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4274  (__kmp_affinity_num_places > 0) &&
4275  ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
4276  __kmp_affinity_num_masks = __kmp_affinity_num_places;
4277  }
4278 #endif
4279 
4280  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4281 
4282  // Sort the address2os table according to the current setting of
4283  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4284  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4285  __kmp_affinity_cmp_Address_child_num);
4286  {
4287  int i;
4288  unsigned j;
4289  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4290  if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
4291  continue;
4292  }
4293  unsigned osId = address2os[i].second;
4294  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4295  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4296  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4297  KMP_CPU_COPY(dest, src);
4298  if (++j >= __kmp_affinity_num_masks) {
4299  break;
4300  }
4301  }
4302  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4303  }
4304  break;
4305 
4306  default:
4307  KMP_ASSERT2(0, "Unexpected affinity setting");
4308  }
4309 
4310  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
4311  machine_hierarchy.init(address2os, __kmp_avail_proc);
4312 }
4313 #undef KMP_EXIT_AFF_NONE
4314 
4315 void __kmp_affinity_initialize(void) {
4316  // Much of the code above was written assuming that if a machine was not
4317  // affinity capable, then __kmp_affinity_type == affinity_none. We now
4318  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4319  // There are too many checks for __kmp_affinity_type == affinity_none
4320  // in this code. Instead of trying to change them all, check if
4321  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4322  // affinity_none, call the real initialization routine, then restore
4323  // __kmp_affinity_type to affinity_disabled.
4324  int disabled = (__kmp_affinity_type == affinity_disabled);
4325  if (!KMP_AFFINITY_CAPABLE()) {
4326  KMP_ASSERT(disabled);
4327  }
4328  if (disabled) {
4329  __kmp_affinity_type = affinity_none;
4330  }
4331  __kmp_aux_affinity_initialize();
4332  if (disabled) {
4333  __kmp_affinity_type = affinity_disabled;
4334  }
4335 }
4336 
4337 void __kmp_affinity_uninitialize(void) {
4338  if (__kmp_affinity_masks != NULL) {
4339  KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4340  __kmp_affinity_masks = NULL;
4341  }
4342  if (__kmp_affin_fullMask != NULL) {
4343  KMP_CPU_FREE(__kmp_affin_fullMask);
4344  __kmp_affin_fullMask = NULL;
4345  }
4346  __kmp_affinity_num_masks = 0;
4347  __kmp_affinity_type = affinity_default;
4348 #if OMP_40_ENABLED
4349  __kmp_affinity_num_places = 0;
4350 #endif
4351  if (__kmp_affinity_proclist != NULL) {
4352  __kmp_free(__kmp_affinity_proclist);
4353  __kmp_affinity_proclist = NULL;
4354  }
4355  if (address2os != NULL) {
4356  __kmp_free(address2os);
4357  address2os = NULL;
4358  }
4359  if (procarr != NULL) {
4360  __kmp_free(procarr);
4361  procarr = NULL;
4362  }
4363 #if KMP_USE_HWLOC
4364  if (__kmp_hwloc_topology != NULL) {
4365  hwloc_topology_destroy(__kmp_hwloc_topology);
4366  __kmp_hwloc_topology = NULL;
4367  }
4368 #endif
4369  KMPAffinity::destroy_api();
4370 }
4371 
4372 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4373  if (!KMP_AFFINITY_CAPABLE()) {
4374  return;
4375  }
4376 
4377  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4378  if (th->th.th_affin_mask == NULL) {
4379  KMP_CPU_ALLOC(th->th.th_affin_mask);
4380  } else {
4381  KMP_CPU_ZERO(th->th.th_affin_mask);
4382  }
4383 
4384  // Copy the thread mask to the kmp_info_t structure. If
4385  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
4386  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
4387  // then the full mask is the same as the mask of the initialization thread.
4388  kmp_affin_mask_t *mask;
4389  int i;
4390 
4391 #if OMP_40_ENABLED
4392  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4393 #endif
4394  {
4395  if ((__kmp_affinity_type == affinity_none) ||
4396  (__kmp_affinity_type == affinity_balanced)) {
4397 #if KMP_GROUP_AFFINITY
4398  if (__kmp_num_proc_groups > 1) {
4399  return;
4400  }
4401 #endif
4402  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4403  i = KMP_PLACE_ALL;
4404  mask = __kmp_affin_fullMask;
4405  } else {
4406  KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4407  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4408  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4409  }
4410  }
4411 #if OMP_40_ENABLED
4412  else {
4413  if ((!isa_root) ||
4414  (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4415 #if KMP_GROUP_AFFINITY
4416  if (__kmp_num_proc_groups > 1) {
4417  return;
4418  }
4419 #endif
4420  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4421  i = KMP_PLACE_ALL;
4422  mask = __kmp_affin_fullMask;
4423  } else {
4424  // int i = some hash function or just a counter that doesn't
4425  // always start at 0. Use gtid for now.
4426  KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4427  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4428  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4429  }
4430  }
4431 #endif
4432 
4433 #if OMP_40_ENABLED
4434  th->th.th_current_place = i;
4435  if (isa_root) {
4436  th->th.th_new_place = i;
4437  th->th.th_first_place = 0;
4438  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4439  }
4440 
4441  if (i == KMP_PLACE_ALL) {
4442  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4443  gtid));
4444  } else {
4445  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4446  gtid, i));
4447  }
4448 #else
4449  if (i == -1) {
4450  KA_TRACE(
4451  100,
4452  ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
4453  gtid));
4454  } else {
4455  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4456  gtid, i));
4457  }
4458 #endif /* OMP_40_ENABLED */
4459 
4460  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4461 
4462  if (__kmp_affinity_verbose) {
4463  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4464  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4465  th->th.th_affin_mask);
4466  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4467  __kmp_gettid(), gtid, buf);
4468  }
4469 
4470 #if KMP_OS_WINDOWS
4471  // On Windows* OS, the process affinity mask might have changed. If the user
4472  // didn't request affinity and this call fails, just continue silently.
4473  // See CQ171393.
4474  if (__kmp_affinity_type == affinity_none) {
4475  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4476  } else
4477 #endif
4478  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4479 }
4480 
4481 #if OMP_40_ENABLED
4482 
4483 void __kmp_affinity_set_place(int gtid) {
4484  int retval;
4485 
4486  if (!KMP_AFFINITY_CAPABLE()) {
4487  return;
4488  }
4489 
4490  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4491 
4492  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4493  "place = %d)\n",
4494  gtid, th->th.th_new_place, th->th.th_current_place));
4495 
4496  // Check that the new place is within this thread's partition.
4497  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4498  KMP_ASSERT(th->th.th_new_place >= 0);
4499  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4500  if (th->th.th_first_place <= th->th.th_last_place) {
4501  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4502  (th->th.th_new_place <= th->th.th_last_place));
4503  } else {
4504  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4505  (th->th.th_new_place >= th->th.th_last_place));
4506  }
4507 
4508  // Copy the thread mask to the kmp_info_t structure,
4509  // and set this thread's affinity.
4510  kmp_affin_mask_t *mask =
4511  KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4512  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4513  th->th.th_current_place = th->th.th_new_place;
4514 
4515  if (__kmp_affinity_verbose) {
4516  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4517  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4518  th->th.th_affin_mask);
4519  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4520  __kmp_gettid(), gtid, buf);
4521  }
4522  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4523 }
4524 
4525 #endif /* OMP_40_ENABLED */
4526 
4527 int __kmp_aux_set_affinity(void **mask) {
4528  int gtid;
4529  kmp_info_t *th;
4530  int retval;
4531 
4532  if (!KMP_AFFINITY_CAPABLE()) {
4533  return -1;
4534  }
4535 
4536  gtid = __kmp_entry_gtid();
4537  KA_TRACE(1000, ; {
4538  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4539  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4540  (kmp_affin_mask_t *)(*mask));
4541  __kmp_debug_printf(
4542  "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4543  buf);
4544  });
4545 
4546  if (__kmp_env_consistency_check) {
4547  if ((mask == NULL) || (*mask == NULL)) {
4548  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4549  } else {
4550  unsigned proc;
4551  int num_procs = 0;
4552 
4553  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4554  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4555  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4556  }
4557  if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4558  continue;
4559  }
4560  num_procs++;
4561  }
4562  if (num_procs == 0) {
4563  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4564  }
4565 
4566 #if KMP_GROUP_AFFINITY
4567  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4568  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4569  }
4570 #endif /* KMP_GROUP_AFFINITY */
4571  }
4572  }
4573 
4574  th = __kmp_threads[gtid];
4575  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4576  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4577  if (retval == 0) {
4578  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4579  }
4580 
4581 #if OMP_40_ENABLED
4582  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4583  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4584  th->th.th_first_place = 0;
4585  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4586 
4587  // Turn off 4.0 affinity for the current thread at this parallel level.
4588  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4589 #endif
4590 
4591  return retval;
4592 }
4593 
4594 int __kmp_aux_get_affinity(void **mask) {
4595  int gtid;
4596  int retval;
4597  kmp_info_t *th;
4598 
4599  if (!KMP_AFFINITY_CAPABLE()) {
4600  return -1;
4601  }
4602 
4603  gtid = __kmp_entry_gtid();
4604  th = __kmp_threads[gtid];
4605  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4606 
4607  KA_TRACE(1000, ; {
4608  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4609  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4610  th->th.th_affin_mask);
4611  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
4612  gtid, buf);
4613  });
4614 
4615  if (__kmp_env_consistency_check) {
4616  if ((mask == NULL) || (*mask == NULL)) {
4617  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4618  }
4619  }
4620 
4621 #if !KMP_OS_WINDOWS
4622 
4623  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4624  KA_TRACE(1000, ; {
4625  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4626  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4627  (kmp_affin_mask_t *)(*mask));
4628  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
4629  gtid, buf);
4630  });
4631  return retval;
4632 
4633 #else
4634 
4635  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4636  return 0;
4637 
4638 #endif /* KMP_OS_WINDOWS */
4639 }
4640 
4641 int __kmp_aux_get_affinity_max_proc() {
4642  if (!KMP_AFFINITY_CAPABLE()) {
4643  return 0;
4644  }
4645 #if KMP_GROUP_AFFINITY
4646  if (__kmp_num_proc_groups > 1) {
4647  return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4648  }
4649 #endif
4650  return __kmp_xproc;
4651 }
4652 
4653 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4654  int retval;
4655 
4656  if (!KMP_AFFINITY_CAPABLE()) {
4657  return -1;
4658  }
4659 
4660  KA_TRACE(1000, ; {
4661  int gtid = __kmp_entry_gtid();
4662  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4663  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4664  (kmp_affin_mask_t *)(*mask));
4665  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4666  "affinity mask for thread %d = %s\n",
4667  proc, gtid, buf);
4668  });
4669 
4670  if (__kmp_env_consistency_check) {
4671  if ((mask == NULL) || (*mask == NULL)) {
4672  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4673  }
4674  }
4675 
4676  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4677  return -1;
4678  }
4679  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4680  return -2;
4681  }
4682 
4683  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4684  return 0;
4685 }
4686 
4687 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
4688  int retval;
4689 
4690  if (!KMP_AFFINITY_CAPABLE()) {
4691  return -1;
4692  }
4693 
4694  KA_TRACE(1000, ; {
4695  int gtid = __kmp_entry_gtid();
4696  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4697  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4698  (kmp_affin_mask_t *)(*mask));
4699  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
4700  "affinity mask for thread %d = %s\n",
4701  proc, gtid, buf);
4702  });
4703 
4704  if (__kmp_env_consistency_check) {
4705  if ((mask == NULL) || (*mask == NULL)) {
4706  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4707  }
4708  }
4709 
4710  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4711  return -1;
4712  }
4713  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4714  return -2;
4715  }
4716 
4717  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4718  return 0;
4719 }
4720 
4721 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
4722  int retval;
4723 
4724  if (!KMP_AFFINITY_CAPABLE()) {
4725  return -1;
4726  }
4727 
4728  KA_TRACE(1000, ; {
4729  int gtid = __kmp_entry_gtid();
4730  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4731  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4732  (kmp_affin_mask_t *)(*mask));
4733  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
4734  "affinity mask for thread %d = %s\n",
4735  proc, gtid, buf);
4736  });
4737 
4738  if (__kmp_env_consistency_check) {
4739  if ((mask == NULL) || (*mask == NULL)) {
4740  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4741  }
4742  }
4743 
4744  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4745  return -1;
4746  }
4747  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4748  return 0;
4749  }
4750 
4751  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4752 }
4753 
4754 // Dynamic affinity settings - Affinity balanced
4755 void __kmp_balanced_affinity(int tid, int nthreads) {
4756  bool fine_gran = true;
4757 
4758  switch (__kmp_affinity_gran) {
4759  case affinity_gran_fine:
4760  case affinity_gran_thread:
4761  break;
4762  case affinity_gran_core:
4763  if (__kmp_nThreadsPerCore > 1) {
4764  fine_gran = false;
4765  }
4766  break;
4767  case affinity_gran_package:
4768  if (nCoresPerPkg > 1) {
4769  fine_gran = false;
4770  }
4771  break;
4772  default:
4773  fine_gran = false;
4774  }
4775 
4776  if (__kmp_affinity_uniform_topology()) {
4777  int coreID;
4778  int threadID;
4779  // Number of hyper-threads per core on an HT machine
4780  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4781  // Number of cores
4782  int ncores = __kmp_ncores;
4783  if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
4784  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
4785  ncores = nPackages;
4786  }
4787  // How many threads will be bound to each core
4788  int chunk = nthreads / ncores;
4789  // How many cores will have an additional thread bound to them - the "big" cores
4790  int big_cores = nthreads % ncores;
4791  // Number of threads on the big cores
4792  int big_nth = (chunk + 1) * big_cores;
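  // For example, nthreads = 10 on ncores = 4 gives chunk = 2, big_cores = 2,
  // big_nth = 6: threads 0-5 land on cores 0-1 (three per core) and
  // threads 6-9 on cores 2-3 (two per core).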
4793  if (tid < big_nth) {
4794  coreID = tid / (chunk + 1);
4795  threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
4796  } else { // tid >= big_nth
4797  coreID = (tid - big_cores) / chunk;
4798  threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
4799  }
4800 
4801  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4802  "Illegal set affinity operation when not capable");
4803 
4804  kmp_affin_mask_t *mask;
4805  KMP_CPU_ALLOC_ON_STACK(mask);
4806  KMP_CPU_ZERO(mask);
4807 
4808  if (fine_gran) {
4809  int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
4810  KMP_CPU_SET(osID, mask);
4811  } else {
4812  for (int i = 0; i < __kmp_nth_per_core; i++) {
4813  int osID;
4814  osID = address2os[coreID * __kmp_nth_per_core + i].second;
4815  KMP_CPU_SET(osID, mask);
4816  }
4817  }
4818  if (__kmp_affinity_verbose) {
4819  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4820  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4821  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4822  __kmp_gettid(), tid, buf);
4823  }
4824  __kmp_set_system_affinity(mask, TRUE);
4825  KMP_CPU_FREE_FROM_STACK(mask);
4826  } else { // Non-uniform topology
4827 
4828  kmp_affin_mask_t *mask;
4829  KMP_CPU_ALLOC_ON_STACK(mask);
4830  KMP_CPU_ZERO(mask);
4831 
4832  int core_level = __kmp_affinity_find_core_level(
4833  address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
4834  int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
4835  __kmp_aff_depth - 1, core_level);
4836  int nth_per_core = __kmp_affinity_max_proc_per_core(
4837  address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4838 
4839  // For performance gain consider the special case nthreads ==
4840  // __kmp_avail_proc
4841  if (nthreads == __kmp_avail_proc) {
4842  if (fine_gran) {
4843  int osID = address2os[tid].second;
4844  KMP_CPU_SET(osID, mask);
4845  } else {
4846  int core = __kmp_affinity_find_core(address2os, tid,
4847  __kmp_aff_depth - 1, core_level);
4848  for (int i = 0; i < __kmp_avail_proc; i++) {
4849  int osID = address2os[i].second;
4850  if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
4851  core_level) == core) {
4852  KMP_CPU_SET(osID, mask);
4853  }
4854  }
4855  }
4856  } else if (nthreads <= ncores) {
4857 
4858  int core = 0;
4859  for (int i = 0; i < ncores; i++) {
4860  // Check if this core from procarr[] is in the mask
4861  int in_mask = 0;
4862  for (int j = 0; j < nth_per_core; j++) {
4863  if (procarr[i * nth_per_core + j] != -1) {
4864  in_mask = 1;
4865  break;
4866  }
4867  }
4868  if (in_mask) {
4869  if (tid == core) {
4870  for (int j = 0; j < nth_per_core; j++) {
4871  int osID = procarr[i * nth_per_core + j];
4872  if (osID != -1) {
4873  KMP_CPU_SET(osID, mask);
4874  // For fine granularity it is enough to set the first available
4875  // osID for this core
4876  if (fine_gran) {
4877  break;
4878  }
4879  }
4880  }
4881  break;
4882  } else {
4883  core++;
4884  }
4885  }
4886  }
4887  } else { // nthreads > ncores
4888  // Array to save the number of processors at each core
4889  int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
4890  // Array to save the number of cores with "x" available processors;
4891  int *ncores_with_x_procs =
4892  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
4893  // Array to save the number of cores with # procs from x to nth_per_core
4894  int *ncores_with_x_to_max_procs =
4895  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
4896 
4897  for (int i = 0; i <= nth_per_core; i++) {
4898  ncores_with_x_procs[i] = 0;
4899  ncores_with_x_to_max_procs[i] = 0;
4900  }
4901 
4902  for (int i = 0; i < ncores; i++) {
4903  int cnt = 0;
4904  for (int j = 0; j < nth_per_core; j++) {
4905  if (procarr[i * nth_per_core + j] != -1) {
4906  cnt++;
4907  }
4908  }
4909  nproc_at_core[i] = cnt;
4910  ncores_with_x_procs[cnt]++;
4911  }
4912 
4913  for (int i = 0; i <= nth_per_core; i++) {
4914  for (int j = i; j <= nth_per_core; j++) {
4915  ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
4916  }
4917  }
4918 
4919  // Max number of processors
4920  int nproc = nth_per_core * ncores;
4921  // An array to keep the number of threads per hardware context
4922  int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4923  for (int i = 0; i < nproc; i++) {
4924  newarr[i] = 0;
4925  }
4926 
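  // Distribute nthreads over the available hardware contexts as evenly as
  // possible: the first sweep (flag == 0) gives each occupied context at
  // most one thread, later sweeps stack additional threads. newarr[i] ends
  // up as the number of threads assigned to context i; this thread then
  // binds to the context at which the running sum of newarr first exceeds
  // its tid.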
4927  int nth = nthreads;
4928  int flag = 0;
4929  while (nth > 0) {
4930  for (int j = 1; j <= nth_per_core; j++) {
4931  int cnt = ncores_with_x_to_max_procs[j];
4932  for (int i = 0; i < ncores; i++) {
4933  // Skip the core with 0 processors
4934  if (nproc_at_core[i] == 0) {
4935  continue;
4936  }
4937  for (int k = 0; k < nth_per_core; k++) {
4938  if (procarr[i * nth_per_core + k] != -1) {
4939  if (newarr[i * nth_per_core + k] == 0) {
4940  newarr[i * nth_per_core + k] = 1;
4941  cnt--;
4942  nth--;
4943  break;
4944  } else {
4945  if (flag != 0) {
4946  newarr[i * nth_per_core + k]++;
4947  cnt--;
4948  nth--;
4949  break;
4950  }
4951  }
4952  }
4953  }
4954  if (cnt == 0 || nth == 0) {
4955  break;
4956  }
4957  }
4958  if (nth == 0) {
4959  break;
4960  }
4961  }
4962  flag = 1;
4963  }
4964  int sum = 0;
4965  for (int i = 0; i < nproc; i++) {
4966  sum += newarr[i];
4967  if (sum > tid) {
4968  if (fine_gran) {
4969  int osID = procarr[i];
4970  KMP_CPU_SET(osID, mask);
4971  } else {
4972  int coreID = i / nth_per_core;
4973  for (int ii = 0; ii < nth_per_core; ii++) {
4974  int osID = procarr[coreID * nth_per_core + ii];
4975  if (osID != -1) {
4976  KMP_CPU_SET(osID, mask);
4977  }
4978  }
4979  }
4980  break;
4981  }
4982  }
4983  __kmp_free(newarr);
4984  }
4985 
4986  if (__kmp_affinity_verbose) {
4987  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4988  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4989  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4990  __kmp_gettid(), tid, buf);
4991  }
4992  __kmp_set_system_affinity(mask, TRUE);
4993  KMP_CPU_FREE_FROM_STACK(mask);
4994  }
4995 }
4996 
4997 #if KMP_OS_LINUX
4998 // We don't need this entry for Windows because
4999 // there is the GetProcessAffinityMask() API
5000 //
5001 // The intended usage is indicated by these steps:
5002 // 1) The user gets the current affinity mask
5003 // 2) Then sets the affinity by calling this function
5004 // 3) Error check the return value
5005 // 4) Use non-OpenMP parallelization
5006 // 5) Reset the affinity to what was stored in step 1)
5007 #ifdef __cplusplus
5008 extern "C"
5009 #endif
5010  int
5011  kmp_set_thread_affinity_mask_initial()
5012 // the function returns 0 on success,
5013 // -1 if we cannot bind thread
5014 // >0 (errno) if an error happened during binding
5015 {
5016  int gtid = __kmp_get_gtid();
5017  if (gtid < 0) {
5018  // Do not touch non-omp threads
5019  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5020  "non-omp thread, returning\n"));
5021  return -1;
5022  }
5023  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5024  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5025  "affinity not initialized, returning\n"));
5026  return -1;
5027  }
5028  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5029  "set full mask for thread %d\n",
5030  gtid));
5031  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5032  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5033 }
5034 #endif
5035 
5036 #endif // KMP_AFFINITY_SUPPORTED