/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_affinity.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
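
// Editor's note (illustrative sketch, values are hypothetical): if the
// hierarchy stores numPerLevel[0] == 4, i.e. 4 threads per leaf group, each
// leaf parent gets base_leaf_kids = 4 - 1 = 3 children to wait for and
// release, and skipPerLevel[] supplies the stride between sibling subtrees
// that the hierarchical barrier uses when walking thread ids level by level.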
#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}
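
// Editor's note (usage sketch, not from the original source): the runtime is
// expected to call KMPAffinity::pick_api() once before performing any mask
// operations and KMPAffinity::destroy_api() during shutdown; after pick_api()
// returns, __kmp_affinity_dispatch points at either the hwloc-backed or the
// native OS implementation chosen above.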
#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }
// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start),
                   static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start));
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous));
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN
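
// Editor's note (worked example derived from the code above): a mask with
// bits {0, 2, 3, 4, 7} set prints as "0,2-4,7" -- a run of three or more
// contiguous bits becomes a "lo-hi" range, isolated bits are printed alone,
// and an empty mask prints as "{<empty>}".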
// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);
  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start),
                          static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%d", static_cast<int>(start));
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous));
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}
void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
// the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
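
// Editor's note (worked example): for a machine modeled as 2 packages x 4
// cores x 2 hardware threads, the product is 16, so the topology is reported
// as uniform only when all 16 OS procs are available to this process
// (__kmp_avail_proc == 16).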
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}
#if KMP_USE_HWLOC

static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    int level = 1; // iterate over levels
    int label = 1; // iterate over labels
    if (__kmp_numa_detected)
      // node level follows package
      if (levels[level++] > 0)
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                            addrP[proc].first.labels[label++]);
    if (__kmp_tile_depth > 0)
      // tile level follows node if any, or package
      if (levels[level++] > 0)
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                            addrP[proc].first.labels[label++]);
    if (levels[level++] > 0)
      // core level follows
      __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                          addrP[proc].first.labels[label++]);
    if (levels[level++] > 0)
      // thread level is the latest
      __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                          addrP[proc].first.labels[label++]);
    KMP_DEBUG_ASSERT(label == depth);
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels, so such levels are removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
      }
    }
  }
  return new_depth;
}
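
// Editor's note (worked example): with labels {package, core, thread} on a
// machine exposing one thread context per core, every address carries thread
// label 0, so the thread level is radix 1; it is marked with -1 in levels[]
// and removed, leaving a 2-level {package, core} map.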
// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}
static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}
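
// Editor's note (usage sketch): callers such as __kmp_hwloc_process_obj_core_pu
// below invoke __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE,
// &core) to learn how many cores sit under a package/tile/node object; the
// helper also leaves the first matching descendant in *f, so the caller can
// then walk the remaining ones via hwloc's next_cousin links.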
static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_obj_t core = NULL;
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    hwloc_obj_t pu = NULL;
    KMP_DEBUG_ASSERT(core != NULL);
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}
// Check if NUMA node detected below the package,
// and if tile object is detected and return its depth
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth, l2cache_depth, package_depth;

  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
  l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  // check tile, get object by depth because of multiple caches possible
  depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}
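
// Editor's note (illustrative example): on a machine where each package
// contains two NUMA nodes, the NUMANODE ancestor of a PU lies deeper in the
// hwloc tree than its PACKAGE ancestor, so __kmp_numa_detected is set and a
// user request for granularity "node" is remapped to the NUMA granularity;
// a tile level is only recorded if the chosen cache/package depth groups more
// than one core.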
static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }
  int depth = 3;
  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
  if (__kmp_numa_detected)
    ++depth;
  if (__kmp_tile_depth)
    ++depth;

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t socket, node, tile;
  int nActiveThreads = 0;
  int socket_id = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
      ++socket_id) {
    labels[0] = socket_id;
    if (__kmp_numa_detected) {
      int NN;
      int n_active_nodes = 0;
      node = NULL;
      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
                                              &node);
      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
        labels[1] = node_id;
        if (__kmp_tile_depth) {
          // NUMA + tiles
          int NT;
          int n_active_tiles = 0;
          tile = NULL;
          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
                                                   &tile);
          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
            labels[2] = tl_id;
            int n_active_cores = 0;
            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                            n_active_cores, tile, 3, labels);
            if (n_active_cores) { // were there any active cores on the socket?
              ++n_active_tiles; // count active tiles per node
              if (n_active_cores > nCorePerTile)
                nCorePerTile = n_active_cores; // calc maximum
            }
          }
          if (n_active_tiles) { // were there any active tiles on the socket?
            ++n_active_nodes; // count active nodes per package
            if (n_active_tiles > nTilePerNode)
              nTilePerNode = n_active_tiles; // calc maximum
          }
        } else {
          // NUMA, no tiles
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, node, 2, labels);
          if (n_active_cores) { // were there any active cores on the socket?
            ++n_active_nodes; // count active nodes per package
            if (n_active_cores > nCorePerNode)
              nCorePerNode = n_active_cores; // calc maximum
          }
        }
      }
      if (n_active_nodes) { // were there any active nodes on the socket?
        ++nPackages; // count total active packages
        if (n_active_nodes > nNodePerPkg)
          nNodePerPkg = n_active_nodes; // calc maximum
      }
    } else {
      if (__kmp_tile_depth) {
        // no NUMA, tiles
        int NT;
        int n_active_tiles = 0;
        tile = NULL;
        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
                                                 &tile);
        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
          labels[1] = tl_id;
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, tile, 2, labels);
          if (n_active_cores) { // were there any active cores on the socket?
            ++n_active_tiles; // count active tiles per package
            if (n_active_cores > nCorePerTile)
              nCorePerTile = n_active_cores; // calc maximum
          }
        }
        if (n_active_tiles) { // were there any active tiles on the socket?
          ++nPackages; // count total active packages
          if (n_active_tiles > nTilePerPkg)
            nTilePerPkg = n_active_tiles; // calc maximum
        }
      } else {
        // no NUMA, no tiles
        int n_active_cores = 0;
        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
                                        socket, 1, labels);
        if (n_active_cores) { // were there any active cores on the socket?
          ++nPackages; // count total active packages
          if (n_active_cores > nCoresPerPkg)
            nCoresPerPkg = n_active_cores; // calc maximum
        }
      }
    }
  }
  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }
  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  int nPUs = nPackages * __kmp_nThreadsPerCore;
  if (__kmp_numa_detected) {
    if (__kmp_tile_depth) { // NUMA + tiles
      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
    } else { // NUMA, no tiles
      nPUs *= (nNodePerPkg * nCorePerNode);
    }
  } else {
    if (__kmp_tile_depth) { // no NUMA, tiles
      nPUs *= (nTilePerPkg * nCorePerTile);
    } else { // no NUMA, no tiles
      nPUs *= nCoresPerPkg;
    }
  }
  unsigned uniform = (nPUs == nActiveThreads);
  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    if (__kmp_numa_detected) {
      if (__kmp_tile_depth) { // NUMA + tiles
        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
                   __kmp_ncores);
      } else { // NUMA, no tiles
        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
        nPUs *= (nNodePerPkg * nCorePerNode);
      }
    } else {
      if (__kmp_tile_depth) { // no NUMA, tiles
        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
      } else { // no NUMA, no tiles
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", nPackages);
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
                   __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
      }
    }
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }
  int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 levels);
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if (__kmp_affinity_gran > affinity_gran_thread) {
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity = group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}
#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      }
      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */
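
// Editor's note (worked example): with a 64-bit DWORD_PTR, OS proc 70 gets
// labels {70 / 64, 70 % 64} = {1, 6}, i.e. processor 6 of Windows processor
// group 1, so each group occupies its own slot at level 0 of the map.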
#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;
  while ((1 << r) < count)
    ++r;
  return r;
}
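
// Editor's note (worked example): __kmp_cpuid_mask_width(6) returns 3 because
// 2^3 = 8 is the smallest power of two >= 6; the result is the number of APIC
// id bits reserved for a field that must encode 6 distinct values.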
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }
  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);
  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   was not.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }
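
  // Editor's note (worked example, hypothetical values): with
  // maxThreadsPerPkg = 16 and maxCoresPerPkg = 8, the loop above computes
  // widthCT = 4, widthC = 3 and widthT = 1, so an apicId of 0x2B decodes as
  // pkgId = 0x2B >> 4 = 2, coreId = (0x2B >> 1) & 0x7 = 5, and
  // threadId = 0x2B & 0x1 = 1.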
  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }
1314 qsort(threadInfo
, nApics
, sizeof(*threadInfo
),
1315 __kmp_affinity_cmp_apicThreadInfo_phys_id
);
1317 // The table is now sorted by pkgId / coreId / threadId, but we really don't
1318 // know the radix of any of the fields. pkgId's may be sparsely assigned among
1319 // the chips on a system. Although coreId's are usually assigned
1320 // [0 .. coresPerPkg-1] and threadId's are usually assigned
1321 // [0..threadsPerCore-1], we don't want to make any such assumptions.
1323 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1324 // total # packages) are at this point - we want to determine that now. We
1325 // only have an upper bound on the first two figures.
1327 // We also perform a consistency check at this point: the values returned by
1328 // the cpuid instruction for any thread bound to a given package had better
1329 // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1332 __kmp_nThreadsPerCore
= 1;
1333 unsigned nCores
= 1;
1335 unsigned pkgCt
= 1; // to determine radii
1336 unsigned lastPkgId
= threadInfo
[0].pkgId
;
1337 unsigned coreCt
= 1;
1338 unsigned lastCoreId
= threadInfo
[0].coreId
;
1339 unsigned threadCt
= 1;
1340 unsigned lastThreadId
= threadInfo
[0].threadId
;
1342 // intra-pkg consist checks
1343 unsigned prevMaxCoresPerPkg
= threadInfo
[0].maxCoresPerPkg
;
1344 unsigned prevMaxThreadsPerPkg
= threadInfo
[0].maxThreadsPerPkg
;
  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }
  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the machine
    // topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
  // get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but so
      // far, the only machine we have seen which does not exit this loop before
      // iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;
  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }
  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  unsigned int proc;
  int nApics = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    // Extract labels for each level in the machine topology map from Apic ID.
    Address addr(depth);
    int prev_shift = 0;

    for (level = 0; level < depth; level++) {
      __kmp_x86_cpuid(11, level, &buf);
      unsigned apicId = buf.edx;
      if (buf.ebx == 0) {
        if (level != depth - 1) {
          KMP_CPU_FREE(oldMask);
          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
          return -1;
        }
        addr.labels[depth - level - 1] = apicId >> prev_shift;
        level++;
        break;
      }
      int shift = buf.eax & 0x1f;
      int mask = (1 << shift) - 1;
      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
      prev_shift = shift;
    }
    if (level != depth) {
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }

    retval[nApics] = AddrUnsPair(addr, proc);
    nApics++;
  }
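
  // Editor's note (worked example, hypothetical shift values): with depth 3
  // and leaf-11 shift widths of 1 (SMT sub-leaf) and 4 (core sub-leaf), an
  // x2APIC id of 0x53 yields labels[2] = 0x53 & 0x1 = 1 (thread),
  // labels[1] = (0x53 & 0xF) >> 1 = 1 (core), and labels[0] = 0x53 >> 4 = 5
  // (package).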
  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, return now.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }
  // Sort the table by physical Id.
  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

  // Find the radix at each of the levels.
  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  for (level = 0; level < depth; level++) {
    totals[level] = 1;
    counts[level] = 1;
    maxCt[level] = 1;
    last[level] = retval[0].first.labels[level];
  }

  // From here on, the iteration variable "level" runs from the finest level to
  // the coarsest, i.e. we iterate forward through
  // (*address2os)[].first.labels[] - in the previous loops, we iterated
  // backwards.
  for (proc = 1; (int)proc < nApics; proc++) {
    for (level = 0; level < depth; level++) {
      if (retval[proc].first.labels[level] != last[level]) {
        int j;
        for (j = level + 1; j < depth; j++) {
          totals[j]++;
          counts[j] = 1;
          // The line below causes printing incorrect topology information in
          // case the max value for some level (maxCt[level]) is encountered
          // earlier than some smaller value while going through the array.
          // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then
          // maxCt[1] == 2,
          // whereas it must be 4.
          // TODO!!! Check if it can be commented safely
          maxCt[j] = 1;
          last[j] = retval[proc].first.labels[j];
        }
        totals[level]++;
        counts[level]++;
        if (counts[level] > maxCt[level]) {
          maxCt[level] = counts[level];
        }
        last[level] = retval[proc].first.labels[level];
        break;
      } else if (level == depth - 1) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
        return -1;
      }
    }
  }
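  // Meaning of the per-level arrays built above (descriptive note):
  //   totals[level] - number of distinct objects seen at this level over the
  //                   whole machine,
  //   counts[level] - number of distinct labels at this level under the
  //                   current parent object,
  //   maxCt[level]  - maximum of counts[level] over all parents seen so far,
  //   last[level]   - label of the previous proc at this level.
  // Illustration (hypothetical machine): 2 packages x 4 cores x 2 threads,
  // with labels[0] being the package level, ends the loop with
  // totals = {2, 8, 16} and maxCt = {2, 4, 2}.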
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return if affinity is not
  // enabled.
  if (threadLevel >= 0) {
    __kmp_nThreadsPerCore = maxCt[threadLevel];
  } else {
    __kmp_nThreadsPerCore = 1;
  }
  nPackages = totals[pkgLevel];

  if (coreLevel >= 0) {
    __kmp_ncores = totals[coreLevel];
    nCoresPerPkg = maxCt[coreLevel];
  } else {
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
  }

  // Check to see if the machine topology is uniform
  unsigned prod = maxCt[0];
  for (level = 1; level < depth; level++) {
    prod *= maxCt[level];
  }
  bool uniform = (prod == totals[level - 1]);
  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[0]);
    for (level = 1; level <= pkgLevel; level++) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }

  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (proc = 0; (int)proc < nApics; ++proc) {
    __kmp_pu_os_idx[proc] = retval[proc].second;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  int new_depth = 0;
  for (level = 0; level < depth; level++) {
    if ((maxCt[level] == 1) && (level != pkgLevel)) {
      continue;
    }
    new_depth++;
  }

  // If we are removing any levels, allocate a new vector to return,
  // and copy the relevant information to it.
  if (new_depth != depth) {
    AddrUnsPair *new_retval =
        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
    for (proc = 0; (int)proc < nApics; proc++) {
      Address addr(new_depth);
      new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
    }
    int new_level = 0;
    int newPkgLevel = -1;
    int newCoreLevel = -1;
    int newThreadLevel = -1;
    for (level = 0; level < depth; level++) {
      if ((maxCt[level] == 1) && (level != pkgLevel)) {
        // Remove this level. Never remove the package level
        continue;
      }
      if (level == pkgLevel) {
        newPkgLevel = new_level;
      }
      if (level == coreLevel) {
        newCoreLevel = new_level;
      }
      if (level == threadLevel) {
        newThreadLevel = new_level;
      }
      for (proc = 0; (int)proc < nApics; proc++) {
        new_retval[proc].first.labels[new_level] =
            retval[proc].first.labels[level];
      }
      new_level++;
    }

    __kmp_free(retval);
    retval = new_retval;
    depth = new_depth;
    pkgLevel = newPkgLevel;
    coreLevel = newCoreLevel;
    threadLevel = newThreadLevel;
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
                                  threadLevel);
  }

  __kmp_free(last);
  __kmp_free(maxCt);
  __kmp_free(counts);
  __kmp_free(totals);
  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#define osIdIndex 0
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
#define nodeIdIndex 4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;

static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
                                                  const void *b) {
  unsigned i;
  const unsigned *aa = *(unsigned *const *)a;
  const unsigned *bb = *(unsigned *const *)b;
  for (i = maxIndex;; i--) {
    if (aa[i] < bb[i])
      return -1;
    if (aa[i] > bb[i])
      return 1;
    if (i == osIdIndex)
      break;
  }
  return 0;
}
#if KMP_USE_HIER_SCHED
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to number of cores
  // Set the maximum number of L2's to either number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
  // or the number of cores for Intel(R) Xeon(R) processors
  // Set the maximum number of NUMA nodes and L3's to number of packages
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) &&   \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}
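// Illustration of the values set above (hypothetical, non-MIC machine with
// nPackages = 2, nCoresPerPkg = 16, __kmp_nThreadsPerCore = 2, and
// __kmp_ncores = 32):
//   __kmp_hier_max_units:   THREAD+1 = 64, L1+1 = 32, L2+1 = 32,
//                           L3+1 = 2, NUMA+1 = 2, LOOP+1 = 1
//   __kmp_hier_threads_per: THREAD+1 = 1, L1+1 = 2, L2+1 = 2,
//                           L3+1 = 32, NUMA+1 = 32, LOOP+1 = 64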
// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
// i.e., this thread's L1 or this thread's L2, etc.
int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
  int index = type + 1;
  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
  if (type == kmp_hier_layer_e::LAYER_THREAD)
    return tid;
  else if (type == kmp_hier_layer_e::LAYER_LOOP)
    return 0;
  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
  if (tid >= num_hw_threads)
    tid = tid % num_hw_threads;
  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
}
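// Example (using the hypothetical values above, for illustration only):
// __kmp_dispatch_get_index(5, kmp_hier_layer_e::LAYER_L1) =
// (5 / __kmp_hier_threads_per[LAYER_L1 + 1]) %
// __kmp_hier_max_units[LAYER_L1 + 1] = (5 / 2) % 32 = 2, i.e. hardware
// threads 4 and 5 resolve to the same L1 (core) index.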
// Return the number of t1's per t2
int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
  int i1 = t1 + 1;
  int i2 = t2 + 1;
  KMP_DEBUG_ASSERT(i1 <= i2);
  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
  // (nthreads/t2) / (nthreads/t1) = t1 / t2
  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
}
#endif // KMP_USE_HIER_SCHED
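// Example (same hypothetical values as above, for illustration only):
// __kmp_dispatch_get_t1_per_t2(LAYER_L1, LAYER_L3) =
// __kmp_hier_threads_per[LAYER_L3 + 1] / __kmp_hier_threads_per[LAYER_L1 + 1]
// = 32 / 2 = 16 L1's (cores) per L3 (package).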
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
                                             int *line,
                                             kmp_i18n_id_t *const msg_id,
                                             FILE *f) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Scan the file and count the number of "processor" (osId) fields,
  // and find the highest value of <n> for a node_<n> field.
  char buf[256];
  unsigned num_records = 0;
  while (!feof(f)) {
    buf[sizeof(buf) - 1] = 1;
    if (!fgets(buf, sizeof(buf), f)) {
      // Read errors presumably because of EOF
      break;
    }

    char s1[] = "processor";
    if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
      num_records++;
      continue;
    }

    // FIXME - this will match "node_<n> <garbage>"
    unsigned level;
    if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
      if (nodeIdIndex + level >= maxIndex) {
        maxIndex = nodeIdIndex + level;
      }
      continue;
    }
  }

  // Check for empty file / no valid processor records, or too many. The number
  // of records can't exceed the number of valid bits in the affinity mask.
  if (num_records == 0) {
    *line = 0;
    *msg_id = kmp_i18n_str_NoProcRecords;
    return -1;
  }
  if (num_records > (unsigned)__kmp_xproc) {
    *line = 0;
    *msg_id = kmp_i18n_str_TooManyProcRecords;
    return -1;
  }
  // Set the file pointer back to the beginning, so that we can scan the file
  // again, this time performing a full parse of the data. Allocate a vector of
  // ProcCpuInfo objects, where we will place the data. Adding an extra element
  // at the end allows us to remove a lot of extra checks for termination
  // conditions.
  if (fseek(f, 0, SEEK_SET) != 0) {
    *line = 0;
    *msg_id = kmp_i18n_str_CantRewindCpuinfo;
    return -1;
  }

  // Allocate the array of records to store the proc info in. The dummy
  // element at the end makes the logic in filling them out easier to code.
  unsigned **threadInfo =
      (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
  unsigned i;
  for (i = 0; i <= num_records; i++) {
    threadInfo[i] =
        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
  }

#define CLEANUP_THREAD_INFO                                                    \
  for (i = 0; i <= num_records; i++) {                                         \
    __kmp_free(threadInfo[i]);                                                 \
  }                                                                            \
  __kmp_free(threadInfo);

  // A value of UINT_MAX means that we didn't find the field
  unsigned __index;

#define INIT_PROC_INFO(p)                                                      \
  for (__index = 0; __index <= maxIndex; __index++) {                          \
    (p)[__index] = UINT_MAX;                                                   \
  }

  for (i = 0; i <= num_records; i++) {
    INIT_PROC_INFO(threadInfo[i]);
  }

  unsigned num_avail = 0;
  *line = 0;
  while (!feof(f)) {
    // Create an inner scoping level, so that all the goto targets at the end of
    // the loop appear in an outer scoping level. This avoids warnings about
    // jumping past an initialization to a target in the same block.
    {
      buf[sizeof(buf) - 1] = 1;
      bool long_line = false;
      if (!fgets(buf, sizeof(buf), f)) {
        // Read errors presumably because of EOF
        // If there is valid data in threadInfo[num_avail], then fake
        // a blank line to ensure that the last address gets parsed.
        bool valid = false;
        for (i = 0; i <= maxIndex; i++) {
          if (threadInfo[num_avail][i] != UINT_MAX) {
            valid = true;
          }
        }
        if (!valid) {
          break;
        }
        buf[0] = 0;
      } else if (!buf[sizeof(buf) - 1]) {
        // The line is longer than the buffer. Set a flag and don't
        // emit an error if we were going to ignore the line, anyway.
        long_line = true;

#define CHECK_LINE                                                             \
  if (long_line) {                                                             \
    CLEANUP_THREAD_INFO;                                                       \
    *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
    return -1;                                                                 \
  }
      }
      (*line)++;

      char s1[] = "processor";
      if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
        CHECK_LINE;
        char *p = strchr(buf + sizeof(s1) - 1, ':');
        unsigned val;
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
          goto no_val;
        if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
#if KMP_ARCH_AARCH64
          // Handle the old AArch64 /proc/cpuinfo layout differently,
          // it contains all of the 'processor' entries listed in a
          // single 'Processor' section, therefore the normal looking
          // for duplicates in that section will always fail.
          num_avail++;
#else
          goto dup_field;
#endif
        threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
        char path[256];
        KMP_SNPRINTF(
            path, sizeof(path),
            "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
            threadInfo[num_avail][osIdIndex]);
        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);

        KMP_SNPRINTF(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
                     threadInfo[num_avail][osIdIndex]);
        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
        continue;
#else
      }
      char s2[] = "physical id";
      if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
        CHECK_LINE;
        char *p = strchr(buf + sizeof(s2) - 1, ':');
        unsigned val;
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
          goto no_val;
        if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
          goto dup_field;
        threadInfo[num_avail][pkgIdIndex] = val;
        continue;
      }
      char s3[] = "core id";
      if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
        CHECK_LINE;
        char *p = strchr(buf + sizeof(s3) - 1, ':');
        unsigned val;
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
          goto no_val;
        if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
          goto dup_field;
        threadInfo[num_avail][coreIdIndex] = val;
        continue;
#endif // KMP_OS_LINUX && USE_SYSFS_INFO
      }
      char s4[] = "thread id";
      if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
        CHECK_LINE;
        char *p = strchr(buf + sizeof(s4) - 1, ':');
        unsigned val;
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
          goto no_val;
        if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
          goto dup_field;
        threadInfo[num_avail][threadIdIndex] = val;
        continue;
      }
      unsigned level;
      if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
        CHECK_LINE;
        char *p = strchr(buf + sizeof(s4) - 1, ':');
        unsigned val;
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
          goto no_val;
        KMP_ASSERT(nodeIdIndex + level <= maxIndex);
        if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
          goto dup_field;
        threadInfo[num_avail][nodeIdIndex + level] = val;
        continue;
      }

      // We didn't recognize the leading token on the line. There are lots of
      // leading tokens that we don't recognize - if the line isn't empty, go on
      // to the next line.
      if ((*buf != 0) && (*buf != '\n')) {
        // If the line is longer than the buffer, read characters
        // until we find a newline.
        if (long_line) {
          int ch;
          while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
            ;
        }
        continue;
      }

      // A newline has signalled the end of the processor record.
      // Check that there aren't too many procs specified.
      if ((int)num_avail == __kmp_xproc) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_TooManyEntries;
        return -1;
      }

      // Check for missing fields. The osId field must be there, and we
      // currently require that the physical id field is specified, also.
      if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingProcField;
        return -1;
      }
      if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingPhysicalIDField;
        return -1;
      }

      // Skip this proc if it is not included in the machine model.
      if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
                         __kmp_affin_fullMask)) {
        INIT_PROC_INFO(threadInfo[num_avail]);
        continue;
      }

      // We have a successful parse of this proc's info.
      // Increment the counter, and prepare for the next proc.
      num_avail++;
      KMP_ASSERT(num_avail <= num_records);
      INIT_PROC_INFO(threadInfo[num_avail]);
    }
    continue;

  no_val:
    CLEANUP_THREAD_INFO;
    *msg_id = kmp_i18n_str_MissingValCpuinfo;
    return -1;

  dup_field:
    CLEANUP_THREAD_INFO;
    *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
    return -1;
  }
#if KMP_MIC && REDUCE_TEAM_SIZE
  unsigned teamSize = 0;
#endif // KMP_MIC && REDUCE_TEAM_SIZE
  // check for num_records == __kmp_xproc ???

  // If there's only one thread context to bind to, form an Address object with
  // depth 1 and return immediately (or, if affinity is off, set address2os to
  // NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only a
  // single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(num_avail > 0);
  KMP_ASSERT(num_avail <= num_records);
  if (num_avail == 1) {
    __kmp_ncores = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
    if (__kmp_affinity_verbose) {
      if (!KMP_AFFINITY_CAPABLE()) {
        KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
          KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
          KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      }
      int index;
      kmp_str_buf_t buf;
      __kmp_str_buf_init(&buf);
      __kmp_str_buf_print(&buf, "1");
      for (index = maxIndex - 1; index > pkgIdIndex; index--) {
        __kmp_str_buf_print(&buf, " x 1");
      }
      KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
      __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
      CLEANUP_THREAD_INFO;
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0][pkgIdIndex];
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    CLEANUP_THREAD_INFO;
    return 1;
  }
  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, num_avail, sizeof(*threadInfo),
        __kmp_affinity_cmp_ProcCpuInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  unsigned *counts =
      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
  unsigned *maxCt =
      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
  unsigned *totals =
      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
  unsigned *lastId =
      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));

  bool assign_thread_ids = false;
  unsigned threadIdCt;
  unsigned index;

restart_radix_check:
  threadIdCt = 0;
  // Initialize the counter arrays with data from threadInfo[0].
  if (assign_thread_ids) {
    if (threadInfo[0][threadIdIndex] == UINT_MAX) {
      threadInfo[0][threadIdIndex] = threadIdCt++;
    } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
      threadIdCt = threadInfo[0][threadIdIndex] + 1;
    }
  }
  for (index = 0; index <= maxIndex; index++) {
    counts[index] = 1;
    maxCt[index] = 1;
    totals[index] = 1;
    lastId[index] = threadInfo[0][index];
  }
  // Run through the rest of the OS procs.
  for (i = 1; i < num_avail; i++) {
    // Find the most significant index whose id differs from the id for the
    // previous OS proc.
    for (index = maxIndex; index >= threadIdIndex; index--) {
      if (assign_thread_ids && (index == threadIdIndex)) {
        // Auto-assign the thread id field if it wasn't specified.
        if (threadInfo[i][threadIdIndex] == UINT_MAX) {
          threadInfo[i][threadIdIndex] = threadIdCt++;
        }
        // Apparently the thread id field was specified for some entries and not
        // others. Start the thread id counter off at the next higher thread id.
        else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
          threadIdCt = threadInfo[i][threadIdIndex] + 1;
        }
      }
      if (threadInfo[i][index] != lastId[index]) {
        // Run through all indices which are less significant, and reset the
        // counts to 1. At all levels up to and including index, we need to
        // increment the totals and record the last id.
        unsigned index2;
        for (index2 = threadIdIndex; index2 < index; index2++) {
          totals[index2]++;
          if (counts[index2] > maxCt[index2]) {
            maxCt[index2] = counts[index2];
          }
          counts[index2] = 1;
          lastId[index2] = threadInfo[i][index2];
        }
        counts[index]++;
        totals[index]++;
        lastId[index] = threadInfo[i][index];

        if (assign_thread_ids && (index > threadIdIndex)) {
#if KMP_MIC && REDUCE_TEAM_SIZE
          // The default team size is the total #threads in the machine
          // minus 1 thread for every core that has 3 or more threads.
          teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE

          // Restart the thread counter, as we are on a new core.
          threadIdCt = 0;

          // Auto-assign the thread id field if it wasn't specified.
          if (threadInfo[i][threadIdIndex] == UINT_MAX) {
            threadInfo[i][threadIdIndex] = threadIdCt++;
          }

          // Apparently the thread id field was specified for some entries and
          // not others. Start the thread id counter off at the next higher
          // thread id.
          else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
            threadIdCt = threadInfo[i][threadIdIndex] + 1;
          }
        }
        break;
      }
    }
    if (index < threadIdIndex) {
      // If thread ids were specified, it is an error if they are not unique.
      // Also, check that we haven't already restarted the loop (to be safe -
      // shouldn't need to).
      if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
        __kmp_free(lastId);
        __kmp_free(totals);
        __kmp_free(maxCt);
        __kmp_free(counts);
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
        return -1;
      }

      // If the thread ids were not specified and we see entries that are
      // duplicates, start the loop over and assign the thread ids manually.
      assign_thread_ids = true;
      goto restart_radix_check;
    }
  }

#if KMP_MIC && REDUCE_TEAM_SIZE
  // The default team size is the total #threads in the machine
  // minus 1 thread for every core that has 3 or more threads.
  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE
  for (index = threadIdIndex; index <= maxIndex; index++) {
    if (counts[index] > maxCt[index]) {
      maxCt[index] = counts[index];
    }
  }

  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
  nCoresPerPkg = maxCt[coreIdIndex];
  nPackages = totals[pkgIdIndex];

  // Check to see if the machine topology is uniform
  unsigned prod = totals[maxIndex];
  for (index = threadIdIndex; index < maxIndex; index++) {
    prod *= maxCt[index];
  }
  bool uniform = (prod == totals[threadIdIndex]);
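  // Illustration of the uniformity test above (hypothetical values): with
  // maxIndex == pkgIdIndex, 2 packages, at most 4 cores per package and at
  // most 2 threads per core, prod = 2 * 2 * 4 = 16; the topology counts as
  // uniform iff that equals totals[threadIdIndex], the total number of usable
  // hardware threads.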
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = totals[coreIdIndex];

  if (__kmp_affinity_verbose) {
    if (!KMP_AFFINITY_CAPABLE()) {
      KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (uniform) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
    } else {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                __kmp_affin_fullMask);
      KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (uniform) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
    }
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
    for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
               maxCt[threadIdIndex], __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }
#if KMP_MIC && REDUCE_TEAM_SIZE
  // Set the default team size.
  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
    __kmp_dflt_team_nth = teamSize;
    KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
                  "__kmp_dflt_team_nth = %d\n",
                  __kmp_dflt_team_nth));
  }
#endif // KMP_MIC && REDUCE_TEAM_SIZE

  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < num_avail; ++i) { // fill the os indices
    __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(lastId);
    __kmp_free(totals);
    __kmp_free(maxCt);
    __kmp_free(counts);
    CLEANUP_THREAD_INFO;
    return 0;
  }
  // Count the number of levels which have more nodes at that level than at the
  // parent's level (with there being an implicit root node of the top level).
  // This is equivalent to saying that there is at least one node at this level
  // which has a sibling. These levels are in the map, and the package level is
  // always in the map.
  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
  for (index = threadIdIndex; index < maxIndex; index++) {
    KMP_ASSERT(totals[index] >= totals[index + 1]);
    inMap[index] = (totals[index] > totals[index + 1]);
  }
  inMap[maxIndex] = (totals[maxIndex] > 1);
  inMap[pkgIdIndex] = true;

  int depth = 0;
  for (index = threadIdIndex; index <= maxIndex; index++) {
    if (inMap[index]) {
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);
  // Construct the data structure that is to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
  int pkgLevel = -1;
  int coreLevel = -1;
  int threadLevel = -1;

  for (i = 0; i < num_avail; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i][osIdIndex];
    int src_index;
    int dst_index = 0;

    for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
      if (!inMap[src_index]) {
        continue;
      }
      addr.labels[dst_index] = threadInfo[i][src_index];
      if (src_index == pkgIdIndex) {
        pkgLevel = dst_index;
      } else if (src_index == coreIdIndex) {
        coreLevel = dst_index;
      } else if (src_index == threadIdIndex) {
        threadLevel = dst_index;
      }
      dst_index++;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    int src_index;
    __kmp_affinity_gran_levels = 0;
    for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
      if (!inMap[src_index]) {
        continue;
      }
      switch (src_index) {
      case threadIdIndex:
        if (__kmp_affinity_gran > affinity_gran_thread) {
          __kmp_affinity_gran_levels++;
        }
        break;
      case coreIdIndex:
        if (__kmp_affinity_gran > affinity_gran_core) {
          __kmp_affinity_gran_levels++;
        }
        break;
      case pkgIdIndex:
        if (__kmp_affinity_gran > affinity_gran_package) {
          __kmp_affinity_gran_levels++;
        }
        break;
      }
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(inMap);
  __kmp_free(lastId);
  __kmp_free(totals);
  __kmp_free(maxCt);
  __kmp_free(counts);
  CLEANUP_THREAD_INFO;
  return depth;
}
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
                                            unsigned *numUnique,
                                            AddrUnsPair *address2os,
                                            unsigned numAddrs) {
  // First form a table of affinity masks in order of OS thread id.
  unsigned depth;
  unsigned maxOsId;
  unsigned i;

  KMP_ASSERT(numAddrs > 0);
  depth = address2os[0].first.depth;

  maxOsId = 0;
  for (i = numAddrs - 1;; --i) {
    unsigned osId = address2os[i].second;
    if (osId > maxOsId) {
      maxOsId = osId;
    }
    if (i == 0)
      break;
  }
  kmp_affin_mask_t *osId2Mask;
  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));

  // Sort the address2os table according to physical order. Doing so will put
  // all threads on the same core/package/node in consecutive locations.
  qsort(address2os, numAddrs, sizeof(*address2os),
        __kmp_affinity_cmp_Address_labels);

  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
  }
  if (__kmp_affinity_gran_levels >= (int)depth) {
    if (__kmp_affinity_verbose ||
        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
      KMP_WARNING(AffThreadsMayMigrate);
    }
  }

  // Run through the table, forming the masks for all threads on each core.
  // Threads on the same core will have identical "Address" objects, not
  // considering the last level, which must be the thread id. All threads on a
  // core will appear consecutively.
  unsigned unique = 0;
  unsigned j = 0; // index of 1st thread on core
  unsigned leader = 0;
  Address *leaderAddr = &(address2os[0].first);
  kmp_affin_mask_t *sum;
  KMP_CPU_ALLOC_ON_STACK(sum);
  KMP_CPU_ZERO(sum);
  KMP_CPU_SET(address2os[0].second, sum);
  for (i = 1; i < numAddrs; i++) {
    // If this thread is sufficiently close to the leader (within the
    // granularity setting), then set the bit for this os thread in the
    // affinity mask for this group, and go on to the next thread.
    if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
      KMP_CPU_SET(address2os[i].second, sum);
      continue;
    }

    // For every thread in this group, copy the mask to the thread's entry in
    // the osId2Mask table. Mark the first address as a leader.
    for (; j < i; j++) {
      unsigned osId = address2os[j].second;
      KMP_DEBUG_ASSERT(osId <= maxOsId);
      kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
      KMP_CPU_COPY(mask, sum);
      address2os[j].first.leader = (j == leader);
    }
    unique++;

    // Start a new mask.
    leader = i;
    leaderAddr = &(address2os[i].first);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[i].second, sum);
  }

  // For every thread in last group, copy the mask to the thread's
  // entry in the osId2Mask table.
  for (; j < i; j++) {
    unsigned osId = address2os[j].second;
    KMP_DEBUG_ASSERT(osId <= maxOsId);
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
    KMP_CPU_COPY(mask, sum);
    address2os[j].first.leader = (j == leader);
  }
  unique++;
  KMP_CPU_FREE_FROM_STACK(sum);

  *maxIndex = maxOsId;
  *numUnique = unique;
  return osId2Mask;
}
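// Example of the grouping done by __kmp_create_masks (illustrative): with a
// granularity of "core" (__kmp_affinity_gran_levels == 1) and 2 hardware
// threads per core, the two sibling threads of each core compare as "close",
// so they share one mask with both of their OS proc bits set, and *numUnique
// ends up equal to the number of cores represented in address2os.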
// Stuff for the affinity proclist parsers. It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.
static kmp_affin_mask_t *newMasks;
static int numNewMasks;
static int nextNewMask;

#define ADD_MASK(_mask)                                                        \
  {                                                                            \
    if (nextNewMask >= numNewMasks) {                                          \
      int i;                                                                   \
      numNewMasks *= 2;                                                        \
      kmp_affin_mask_t *temp;                                                  \
      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
      for (i = 0; i < numNewMasks / 2; i++) {                                  \
        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
        KMP_CPU_COPY(dest, src);                                               \
      }                                                                        \
      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
      newMasks = temp;                                                         \
    }                                                                          \
    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
    nextNewMask++;                                                             \
  }

#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
  {                                                                            \
    if (((_osId) > _maxOsId) ||                                                \
        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
      if (__kmp_affinity_verbose ||                                            \
          (__kmp_affinity_warnings &&                                          \
           (__kmp_affinity_type != affinity_none))) {                          \
        KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
      }                                                                        \
    } else {                                                                   \
      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
    }                                                                          \
  }
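// Note on ADD_MASK growth (descriptive): newMasks starts with a small
// capacity and is doubled whenever nextNewMask reaches the current capacity
// (2 -> 4 -> 8 -> ...); the existing masks are copied into the new array and
// the old array is freed before the new mask is appended.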
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
                                            unsigned int *out_numMasks,
                                            const char *proclist,
                                            kmp_affin_mask_t *osId2Mask,
                                            int maxOsId) {
  int i;
  const char *scan = proclist;
  const char *next = proclist;

  // We use malloc() for the temporary mask vector, so that we can use
  // realloc() to extend it.
  numNewMasks = 2;
  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
  nextNewMask = 0;
  kmp_affin_mask_t *sumMask;
  KMP_CPU_ALLOC(sumMask);
  int setSize = 0;

  for (;;) {
    int start, end, stride;

    SKIP_WS(scan);
    next = scan;
    if (*next == '\0') {
      break;
    }

    if (*next == '{') {
      int num;
      setSize = 0;
      next++; // skip '{'
      SKIP_WS(next);
      scan = next;

      // Read the first integer in the set.
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
      SKIP_DIGITS(next);
      num = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(num >= 0, "bad explicit proc list");

      // Copy the mask for that osId to the sum (union) mask.
      if ((num > maxOsId) ||
          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, num);
        }
        KMP_CPU_ZERO(sumMask);
      } else {
        KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
        setSize = 1;
      }

      for (;;) {
        // Check for end of set.
        SKIP_WS(next);
        if (*next == '}') {
          next++; // skip '}'
          break;
        }

        // Skip optional comma.
        if (*next == ',') {
          next++;
        }
        SKIP_WS(next);

        // Read the next integer in the set.
        scan = next;
        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");

        SKIP_DIGITS(next);
        num = __kmp_str_to_int(scan, *next);
        KMP_ASSERT2(num >= 0, "bad explicit proc list");

        // Add the mask for that osId to the sum mask.
        if ((num > maxOsId) ||
            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, num);
          }
        } else {
          KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
          setSize++;
        }
      }
      if (setSize > 0) {
        ADD_MASK(sumMask);
      }

      // Skip optional comma.
      SKIP_WS(next);
      if (*next == ',') {
        next++;
      }
      scan = next;
      continue;
    }

    // Read the first integer.
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(start >= 0, "bad explicit proc list");
    SKIP_WS(next);

    // If this isn't a range, then add a mask to the list and go on.
    if (*next != '-') {
      ADD_MASK_OSID(start, osId2Mask, maxOsId);

      // Skip optional comma.
      if (*next == ',') {
        next++;
      }
      scan = next;
      continue;
    }

    // This is a range. Skip over the '-' and read in the 2nd int.
    next++; // skip '-'
    SKIP_WS(next);
    scan = next;
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    SKIP_DIGITS(next);
    end = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(end >= 0, "bad explicit proc list");

    // Check for a stride parameter
    stride = 1;
    SKIP_WS(next);
    if (*next == ':') {
      // A stride is specified. Skip over the ':' and read the 3rd int.
      int sign = +1;
      next++; // skip ':'
      SKIP_WS(next);
      scan = next;
      if (*next == '-') {
        sign = -1;
        next++;
        SKIP_WS(next);
        scan = next;
      }
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
      SKIP_DIGITS(next);
      stride = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(stride >= 0, "bad explicit proc list");
      stride *= sign;
    }

    // Do some range checks.
    KMP_ASSERT2(stride != 0, "bad explicit proc list");
    if (stride > 0) {
      KMP_ASSERT2(start <= end, "bad explicit proc list");
    } else {
      KMP_ASSERT2(start >= end, "bad explicit proc list");
    }
    KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");

    // Add the mask for each OS proc # to the list.
    if (stride > 0) {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start <= end);
    } else {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start >= end);
    }

    // Skip optional comma.
    SKIP_WS(next);
    if (*next == ',') {
      next++;
    }
    scan = next;
  }

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
  KMP_CPU_FREE(sumMask);
}
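// Example (illustrative): a proclist of "3,0-2,{7,8}" yields five masks in
// order: {3}, {0}, {1}, {2} and {7,8} - each plain number or range element
// becomes its own mask, while a braced set is OR'ed into a single mask -
// assuming all of those OS procs are present in osId2Mask.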
/*-----------------------------------------------------------------------------
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:

place_list := place
place_list := place , place_list
place := num
place := place : num
place := place : num : signed
place := { subplacelist }
place := ! place                  // (lowest priority)
subplace_list := subplace
subplace_list := subplace , subplace_list
subplace := num
subplace := num : num
subplace := num : num : signed
signed := num
signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
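// Examples of place lists accepted by this grammar (illustrative):
//   "{0,1},{2,3},{4,5}" - three explicit places of two OS procs each.
//   "{0:4}:4:4"         - the place {0,1,2,3} expanded to four places with a
//                         stride of 4, i.e. {0-3},{4-7},{8-11},{12-15}.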
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affin_mask_t *osId2Mask,
                                        int maxOsId, kmp_affin_mask_t *tempMask,
                                        int *setSize) {
  const char *next;

  for (;;) {
    int start, count, stride, i;

    // Read in the starting proc id
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(start >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      if ((start > maxOsId) ||
          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, start);
        }
      } else {
        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
        (*setSize)++;
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read count parameter
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(count >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start++;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read stride parameter
    int sign = +1;
    SKIP_WS(*scan);
    if (**scan == '+') {
      (*scan)++; // skip '+'
    }
    if (**scan == '-') {
      sign *= -1;
      (*scan)++; // skip '-'
    }
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    stride = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(stride >= 0);
    *scan = next;
    stride *= sign;

    // valid follow sets are ',' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start += stride;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }
}
static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
                                int maxOsId, kmp_affin_mask_t *tempMask,
                                int *setSize) {
  const char *next;

  // valid follow sets are '{' '!' and num
  SKIP_WS(*scan);
  if (**scan == '{') {
    (*scan)++; // skip '{'
    __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
    KMP_ASSERT2(**scan == '}', "bad explicit places list");
    (*scan)++; // skip '}'
  } else if (**scan == '!') {
    (*scan)++; // skip '!'
    __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
    KMP_CPU_COMPLEMENT(maxOsId, tempMask);
  } else if ((**scan >= '0') && (**scan <= '9')) {
    next = *scan;
    SKIP_DIGITS(next);
    int num = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(num >= 0);
    if ((num > maxOsId) ||
        (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffIgnoreInvalidProcID, num);
      }
    } else {
      KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
      (*setSize)++;
    }
    *scan = next; // skip num
  } else {
    KMP_ASSERT2(0, "bad explicit places list");
  }
}
void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                                      unsigned int *out_numMasks,
                                      const char *placelist,
                                      kmp_affin_mask_t *osId2Mask,
                                      int maxOsId) {
  int i, j, count, stride, sign;
  const char *scan = placelist;
  const char *next = placelist;

  numNewMasks = 2;
  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
  nextNewMask = 0;

  // tempMask is modified based on the previous or initial
  //   place to form the current place
  // previousMask contains the previous place
  kmp_affin_mask_t *tempMask;
  kmp_affin_mask_t *previousMask;
  KMP_CPU_ALLOC(tempMask);
  KMP_CPU_ZERO(tempMask);
  KMP_CPU_ALLOC(previousMask);
  KMP_CPU_ZERO(previousMask);
  int setSize = 0;

  for (;;) {
    __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

    // valid follow sets are ',' ':' and EOL
    SKIP_WS(scan);
    if (*scan == '\0' || *scan == ',') {
      if (setSize > 0) {
        ADD_MASK(tempMask);
      }
      KMP_CPU_ZERO(tempMask);
      setSize = 0;
      if (*scan == '\0') {
        break;
      }
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(*scan == ':', "bad explicit places list");
    scan++; // skip ':'

    // Read count parameter
    SKIP_WS(scan);
    KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
    next = scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(scan, *next);
    KMP_ASSERT(count >= 0);
    scan = next;

    // valid follow sets are ',' ':' and EOL
    SKIP_WS(scan);
    if (*scan == '\0' || *scan == ',') {
      stride = +1;
    } else {
      KMP_ASSERT2(*scan == ':', "bad explicit places list");
      scan++; // skip ':'

      // Read stride parameter
      sign = +1;
      for (;;) {
        SKIP_WS(scan);
        if (*scan == '+') {
          scan++; // skip '+'
          continue;
        }
        if (*scan == '-') {
          sign *= -1;
          scan++; // skip '-'
          continue;
        }
        break;
      }
      SKIP_WS(scan);
      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
      next = scan;
      SKIP_DIGITS(next);
      stride = __kmp_str_to_int(scan, *next);
      KMP_DEBUG_ASSERT(stride >= 0);
      scan = next;
      stride *= sign;
    }

    // Add places determined by initial_place : count : stride
    for (i = 0; i < count; i++) {
      if (setSize == 0) {
        break;
      }
      // Add the current place, then build the next place (tempMask) from that
      KMP_CPU_COPY(previousMask, tempMask);
      ADD_MASK(previousMask);
      KMP_CPU_ZERO(tempMask);
      setSize = 0;
      KMP_CPU_SET_ITERATE(j, previousMask) {
        if (!KMP_CPU_ISSET(j, previousMask)) {
          continue;
        }
        if ((j + stride > maxOsId) || (j + stride < 0) ||
            (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
            (!KMP_CPU_ISSET(j + stride,
                            KMP_CPU_INDEX(osId2Mask, j + stride)))) {
          if ((__kmp_affinity_verbose ||
               (__kmp_affinity_warnings &&
                (__kmp_affinity_type != affinity_none))) &&
              i < count - 1) {
            KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
          }
          continue;
        }
        KMP_CPU_SET(j + stride, tempMask);
        setSize++;
      }
    }
    KMP_CPU_ZERO(tempMask);
    setSize = 0;

    // valid follow sets are ',' and EOL
    SKIP_WS(scan);
    if (*scan == '\0') {
      break;
    }
    if (*scan == ',') {
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  KMP_CPU_FREE(tempMask);
  KMP_CPU_FREE(previousMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
}
#undef ADD_MASK
#undef ADD_MASK_OSID

#if KMP_USE_HWLOC
static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
  // skip PUs descendants of the object o
  int skipped = 0;
  hwloc_obj_t hT = NULL;
  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
  for (int i = 0; i < N; ++i) {
    KMP_DEBUG_ASSERT(hT);
    unsigned idx = hT->os_index;
    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
      KMP_CPU_CLR(idx, __kmp_affin_fullMask);
      KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
      ++skipped;
    }
    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
  }
  return skipped; // count number of skipped units
}
static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
  // check if obj has PUs present in fullMask
  hwloc_obj_t hT = NULL;
  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
  for (int i = 0; i < N; ++i) {
    KMP_DEBUG_ASSERT(hT);
    unsigned idx = hT->os_index;
    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
      return 1; // found PU
    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
  }
  return 0; // no PUs found
}
#endif // KMP_USE_HWLOC
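// The helpers above are used by __kmp_apply_thread_places below to trim
// __kmp_affin_fullMask according to the KMP_HW_SUBSET request stored in the
// __kmp_hws_* structures. For illustration (the exact spelling of the
// environment variable is parsed elsewhere), a request along the lines of
// "1s,2c,2t" would keep one socket, two cores per socket and two threads per
// core, and every other PU would be cleared from the full mask.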
3405 static void __kmp_apply_thread_places(AddrUnsPair
**pAddr
, int depth
) {
3406 AddrUnsPair
*newAddr
;
3407 if (__kmp_hws_requested
== 0)
3408 goto _exit
; // no topology limiting actions requested, exit
3410 if (__kmp_affinity_dispatch
->get_api_type() == KMPAffinity::HWLOC
) {
3411 // Number of subobjects calculated dynamically, this works fine for
3412 // any non-uniform topology.
3413 // L2 cache objects are determined by depth, other objects - by type.
3414 hwloc_topology_t tp
= __kmp_hwloc_topology
;
3415 int nS
= 0, nN
= 0, nL
= 0, nC
= 0,
3416 nT
= 0; // logical index including skipped
3417 int nCr
= 0, nTr
= 0; // number of requested units
3418 int nPkg
= 0, nCo
= 0, n_new
= 0, n_old
= 0, nCpP
= 0, nTpC
= 0; // counters
3419 hwloc_obj_t hT
, hC
, hL
, hN
, hS
; // hwloc objects (pointers to)
3422 // check support of extensions ----------------------------------
3423 int numa_support
= 0, tile_support
= 0;
3424 if (__kmp_pu_os_idx
)
3425 hT
= hwloc_get_pu_obj_by_os_index(tp
,
3426 __kmp_pu_os_idx
[__kmp_avail_proc
- 1]);
3428 hT
= hwloc_get_obj_by_type(tp
, HWLOC_OBJ_PU
, __kmp_avail_proc
- 1);
3429 if (hT
== NULL
) { // something's gone wrong
3430 KMP_WARNING(AffHWSubsetUnsupported
);
3434 hN
= hwloc_get_ancestor_obj_by_type(tp
, HWLOC_OBJ_NUMANODE
, hT
);
3435 hS
= hwloc_get_ancestor_obj_by_type(tp
, HWLOC_OBJ_PACKAGE
, hT
);
3436 if (hN
!= NULL
&& hN
->depth
> hS
->depth
) {
3437 numa_support
= 1; // 1 in case socket includes node(s)
3438 } else if (__kmp_hws_node
.num
> 0) {
3439 // don't support sockets inside NUMA node (no such HW found for testing)
3440 KMP_WARNING(AffHWSubsetUnsupported
);
3443 // check L2 cahce, get object by depth because of multiple caches
3444 L2depth
= hwloc_get_cache_type_depth(tp
, 2, HWLOC_OBJ_CACHE_UNIFIED
);
3445 hL
= hwloc_get_ancestor_obj_by_depth(tp
, L2depth
, hT
);
3447 __kmp_hwloc_count_children_by_type(tp
, hL
, HWLOC_OBJ_CORE
, &hC
) > 1) {
3448 tile_support
= 1; // no sense to count L2 if it includes single core
3449 } else if (__kmp_hws_tile
.num
> 0) {
3450 if (__kmp_hws_core
.num
== 0) {
3451 __kmp_hws_core
= __kmp_hws_tile
; // replace L2 with core
3452 __kmp_hws_tile
.num
= 0;
3454 // L2 and core are both requested, but represent same object
3455 KMP_WARNING(AffHWSubsetInvalid
);
3459 // end of check of extensions -----------------------------------
3461 // fill in unset items, validate settings -----------------------
3462 if (__kmp_hws_socket
.num
== 0)
3463 __kmp_hws_socket
.num
= nPackages
; // use all available sockets
3464 if (__kmp_hws_socket
.offset
>= nPackages
) {
3465 KMP_WARNING(AffHWSubsetManySockets
);
3470 int NN
= __kmp_hwloc_count_children_by_type(tp
, hS
, HWLOC_OBJ_NUMANODE
,
3471 &hN
); // num nodes in socket
3472 if (__kmp_hws_node
.num
== 0)
3473 __kmp_hws_node
.num
= NN
; // use all available nodes
3474 if (__kmp_hws_node
.offset
>= NN
) {
3475 KMP_WARNING(AffHWSubsetManyNodes
);
3479 // get num tiles in node
3480 int NL
= __kmp_hwloc_count_children_by_depth(tp
, hN
, L2depth
, &hL
);
3481 if (__kmp_hws_tile
.num
== 0) {
3482 __kmp_hws_tile
.num
= NL
+ 1;
3483 } // use all available tiles, some node may have more tiles, thus +1
3484 if (__kmp_hws_tile
.offset
>= NL
) {
3485 KMP_WARNING(AffHWSubsetManyTiles
);
3488 int NC
= __kmp_hwloc_count_children_by_type(tp
, hL
, HWLOC_OBJ_CORE
,
3489 &hC
); // num cores in tile
3490 if (__kmp_hws_core
.num
== 0)
3491 __kmp_hws_core
.num
= NC
; // use all available cores
3492 if (__kmp_hws_core
.offset
>= NC
) {
3493 KMP_WARNING(AffHWSubsetManyCores
);
3496 } else { // tile_support
3497 int NC
= __kmp_hwloc_count_children_by_type(tp
, hN
, HWLOC_OBJ_CORE
,
3498 &hC
); // num cores in node
3499 if (__kmp_hws_core
.num
== 0)
3500 __kmp_hws_core
.num
= NC
; // use all available cores
3501 if (__kmp_hws_core
.offset
>= NC
) {
3502 KMP_WARNING(AffHWSubsetManyCores
);
3506 } else { // numa_support
3508 // get num tiles in socket
3509 int NL
= __kmp_hwloc_count_children_by_depth(tp
, hS
, L2depth
, &hL
);
3510 if (__kmp_hws_tile
.num
== 0)
3511 __kmp_hws_tile
.num
= NL
; // use all available tiles
3512 if (__kmp_hws_tile
.offset
>= NL
) {
3513 KMP_WARNING(AffHWSubsetManyTiles
);
3516 int NC
= __kmp_hwloc_count_children_by_type(tp
, hL
, HWLOC_OBJ_CORE
,
3517 &hC
); // num cores in tile
3518 if (__kmp_hws_core
.num
== 0)
3519 __kmp_hws_core
.num
= NC
; // use all available cores
3520 if (__kmp_hws_core
.offset
>= NC
) {
3521 KMP_WARNING(AffHWSubsetManyCores
);
3524 } else { // tile_support
3525 int NC
= __kmp_hwloc_count_children_by_type(tp
, hS
, HWLOC_OBJ_CORE
,
3526 &hC
); // num cores in socket
3527 if (__kmp_hws_core
.num
== 0)
3528 __kmp_hws_core
.num
= NC
; // use all available cores
3529 if (__kmp_hws_core
.offset
>= NC
) {
3530 KMP_WARNING(AffHWSubsetManyCores
);
3535 if (__kmp_hws_proc
.num
== 0)
3536 __kmp_hws_proc
.num
= __kmp_nThreadsPerCore
; // use all available procs
3537 if (__kmp_hws_proc
.offset
>= __kmp_nThreadsPerCore
) {
3538 KMP_WARNING(AffHWSubsetManyProcs
);
3541 // end of validation --------------------------------------------
3543 if (pAddr
) // pAddr is NULL in case of affinity_none
3544 newAddr
= (AddrUnsPair
*)__kmp_allocate(sizeof(AddrUnsPair
) *
3545 __kmp_avail_proc
); // max size
3546 // main loop to form HW subset ----------------------------------
3548 int NP
= hwloc_get_nbobjs_by_type(tp
, HWLOC_OBJ_PACKAGE
);
3549 for (int s
= 0; s
< NP
; ++s
) {
3550 // Check Socket -----------------------------------------------
3551 hS
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PACKAGE
, hS
);
3552 if (!__kmp_hwloc_obj_has_PUs(tp
, hS
))
3553 continue; // skip socket if all PUs are out of fullMask
3554 ++nS
; // only count objects those have PUs in affinity mask
3555 if (nS
<= __kmp_hws_socket
.offset
||
3556 nS
> __kmp_hws_socket
.num
+ __kmp_hws_socket
.offset
) {
3557 n_old
+= __kmp_hwloc_skip_PUs_obj(tp
, hS
); // skip socket
3558 continue; // move to next socket
3560 nCr
= 0; // count number of cores per socket
3561 // socket requested, go down the topology tree
3562 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3566 // num nodes in current socket
3568 __kmp_hwloc_count_children_by_type(tp
, hS
, HWLOC_OBJ_NUMANODE
, &hN
);
3569 for (int n
= 0; n
< NN
; ++n
) {
3570 // Check NUMA Node ----------------------------------------
3571 if (!__kmp_hwloc_obj_has_PUs(tp
, hN
)) {
3572 hN
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_NUMANODE
, hN
);
3573 continue; // skip node if all PUs are out of fullMask
3576 if (nN
<= __kmp_hws_node
.offset
||
3577 nN
> __kmp_hws_node
.num
+ __kmp_hws_node
.offset
) {
3578 // skip node as not requested
3579 n_old
+= __kmp_hwloc_skip_PUs_obj(tp
, hN
); // skip node
3580 hN
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_NUMANODE
, hN
);
3581 continue; // move to next node
3583 // node requested, go down the topology tree
3587 int NL
= __kmp_hwloc_count_children_by_depth(tp
, hN
, L2depth
, &hL
);
3588 for (int l
= 0; l
< NL
; ++l
) {
3589 // Check L2 (tile) ------------------------------------
3590 if (!__kmp_hwloc_obj_has_PUs(tp
, hL
)) {
3591 hL
= hwloc_get_next_obj_by_depth(tp
, L2depth
, hL
);
3592 continue; // skip tile if all PUs are out of fullMask
3595 if (nL
<= __kmp_hws_tile
.offset
||
3596 nL
> __kmp_hws_tile
.num
+ __kmp_hws_tile
.offset
) {
3597 // skip tile as not requested
3598 n_old
+= __kmp_hwloc_skip_PUs_obj(tp
, hL
); // skip tile
3599 hL
= hwloc_get_next_obj_by_depth(tp
, L2depth
, hL
);
3600 continue; // move to next tile
3602 // tile requested, go down the topology tree
3605 // num cores in current tile
3606 int NC
= __kmp_hwloc_count_children_by_type(tp
, hL
,
3607 HWLOC_OBJ_CORE
, &hC
);
3608 for (int c
= 0; c
< NC
; ++c
) {
3609 // Check Core ---------------------------------------
3610 if (!__kmp_hwloc_obj_has_PUs(tp
, hC
)) {
3611 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3612 continue; // skip core if all PUs are out of fullMask
3615 if (nC
<= __kmp_hws_core
.offset
||
3616 nC
> __kmp_hws_core
.num
+ __kmp_hws_core
.offset
) {
3617 // skip node as not requested
3618 n_old
+= __kmp_hwloc_skip_PUs_obj(tp
, hC
); // skip core
3619 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3620 continue; // move to next node
3622 // core requested, go down to PUs
3626 // num procs in current core
3627 int NT
= __kmp_hwloc_count_children_by_type(tp
, hC
,
3629 for (int t
= 0; t
< NT
; ++t
) {
3630 // Check PU ---------------------------------------
3632 if (!KMP_CPU_ISSET(idx
, __kmp_affin_fullMask
)) {
3633 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3634 continue; // skip PU if not in fullMask
3637 if (nT
<= __kmp_hws_proc
.offset
||
3638 nT
> __kmp_hws_proc
.num
+ __kmp_hws_proc
.offset
) {
3640 KMP_CPU_CLR(idx
, __kmp_affin_fullMask
);
3642 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx
));
3643 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3644 continue; // move to next node
3647 if (pAddr
) // collect requested thread's data
3648 newAddr
[n_new
] = (*pAddr
)[n_old
];
3651 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3654 ++nCr
; // num cores per socket
3655 ++nCo
; // total num cores
3657 nTpC
= nTr
; // calc max threads per core
3659 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3661 hL
= hwloc_get_next_obj_by_depth(tp
, L2depth
, hL
);
3663 } else { // tile_support
3664 // no tiles, check cores
3667 // num cores in current node
3669 __kmp_hwloc_count_children_by_type(tp
, hN
, HWLOC_OBJ_CORE
, &hC
);
3670 for (int c
= 0; c
< NC
; ++c
) {
3671 // Check Core ---------------------------------------
3672 if (!__kmp_hwloc_obj_has_PUs(tp
, hC
)) {
3673 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3674 continue; // skip core if all PUs are out of fullMask
3677 if (nC
<= __kmp_hws_core
.offset
||
3678 nC
> __kmp_hws_core
.num
+ __kmp_hws_core
.offset
) {
3679 // skip node as not requested
3680 n_old
+= __kmp_hwloc_skip_PUs_obj(tp
, hC
); // skip core
3681 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3682 continue; // move to next node
3684 // core requested, go down to PUs
3689 __kmp_hwloc_count_children_by_type(tp
, hC
, HWLOC_OBJ_PU
, &hT
);
3690 for (int t
= 0; t
< NT
; ++t
) {
3691 // Check PU ---------------------------------------
3693 if (!KMP_CPU_ISSET(idx
, __kmp_affin_fullMask
)) {
3694 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3695 continue; // skip PU if not in fullMask
3698 if (nT
<= __kmp_hws_proc
.offset
||
3699 nT
> __kmp_hws_proc
.num
+ __kmp_hws_proc
.offset
) {
3701 KMP_CPU_CLR(idx
, __kmp_affin_fullMask
);
3703 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx
));
3704 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3705 continue; // move to next node
3708 if (pAddr
) // collect requested thread's data
3709 newAddr
[n_new
] = (*pAddr
)[n_old
];
3712 hT
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, hT
);
3715 ++nCr
; // num cores per socket
3716 ++nCo
; // total num cores
3718 nTpC
= nTr
; // calc max threads per core
3720 hC
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_CORE
, hC
);
3723 hN
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_NUMANODE
, hN
);
        } // nodes loop
      } else { // numa_support
        // no NUMA support
        if (tile_support) {
          nL = 0;
          hL = NULL;
          // num tiles in current socket
          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
          for (int l = 0; l < NL; ++l) {
            // Check L2 (tile) ------------------------------------
            if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
              continue; // skip tile if all PUs are out of fullMask
            }
            ++nL;
            if (nL <= __kmp_hws_tile.offset ||
                nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
              // skip tile as not requested
              n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
              continue; // move to next tile
            }
            // tile requested, go down the topology tree
            nC = 0;
            hC = NULL;
            // num cores per tile
            int NC =
                __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
            for (int c = 0; c < NC; ++c) {
              // Check Core ---------------------------------------
              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // skip core if all PUs are out of fullMask
              }
              ++nC;
              if (nC <= __kmp_hws_core.offset ||
                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                // skip node as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // move to next node
              }
              // core requested, go down to PUs
              nT = 0;
              nTr = 0;
              hT = NULL;
              // num procs per core
              int NT =
                  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
              for (int t = 0; t < NT; ++t) {
                // Check PU ---------------------------------------
                idx = hT->os_index;
                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // skip PU if not in fullMask
                }
                ++nT;
                if (nT <= __kmp_hws_proc.offset ||
                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                  // skip PU
                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                  ++n_old;
                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // move to next node
                }
                ++nTr;
                if (pAddr) // collect requested thread's data
                  newAddr[n_new] = (*pAddr)[n_old];
                ++n_new;
                ++n_old;
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
              } // threads loop
              if (nTr > 0) {
                ++nCr; // num cores per socket
                ++nCo; // total num cores
                if (nTr > nTpC)
                  nTpC = nTr; // calc max threads per core
              }
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
            } // cores loop
            hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
          } // tiles loop
        } else { // tile_support
          // no tiles, check cores
          nC = 0;
          hC = NULL;
          // num cores in socket
          int NC =
              __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
          for (int c = 0; c < NC; ++c) {
            // Check Core -------------------------------------------
            if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              continue; // skip core if all PUs are out of fullMask
            }
            ++nC;
            if (nC <= __kmp_hws_core.offset ||
                nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
              // skip node as not requested
              n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              continue; // move to next node
            }
            // core requested, go down to PUs
            nT = 0;
            nTr = 0;
            hT = NULL;
            // num procs per core
            int NT =
                __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
            for (int t = 0; t < NT; ++t) {
              // Check PU ---------------------------------------
              idx = hT->os_index;
              if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                continue; // skip PU if not in fullMask
              }
              ++nT;
              if (nT <= __kmp_hws_proc.offset ||
                  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                // skip PU
                KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                ++n_old;
                KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                continue; // move to next node
              }
              ++nTr;
              if (pAddr) // collect requested thread's data
                newAddr[n_new] = (*pAddr)[n_old];
              ++n_new;
              ++n_old;
              hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
            } // threads loop
            if (nTr > 0) {
              ++nCr; // num cores per socket
              ++nCo; // total num cores
              if (nTr > nTpC)
                nTpC = nTr; // calc max threads per core
            }
            hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
          } // cores loop
        } // tile_support
      } // numa_support
      if (nCr > 0) { // found cores?
        ++nPkg; // num sockets
        if (nCr > nCpP)
          nCpP = nCr; // calc max cores per socket
      }
    } // sockets loop

    // check the subset is valid
    KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
    KMP_DEBUG_ASSERT(nPkg > 0);
    KMP_DEBUG_ASSERT(nCpP > 0);
    KMP_DEBUG_ASSERT(nTpC > 0);
    KMP_DEBUG_ASSERT(nCo > 0);
    KMP_DEBUG_ASSERT(nPkg <= nPackages);
    KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
    KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
    KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);

    nPackages = nPkg; // correct num sockets
    nCoresPerPkg = nCpP; // correct num cores per socket
    __kmp_nThreadsPerCore = nTpC; // correct num threads per core
    __kmp_avail_proc = n_new; // correct num procs
    __kmp_ncores = nCo; // correct num cores
    // hwloc topology method end
  } else
#endif // KMP_USE_HWLOC
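  // Worked example (illustrative only): every level of the KMP_HW_SUBSET walk
  // above keeps an object whose 1-based index n satisfies
  //   n > offset && n <= offset + num,
  // e.g. a hypothetical request of 2 sockets at offset 1 with 2 cores and 3
  // threads each keeps sockets 2-3, cores 1-2 of each kept socket and HW
  // threads 1-3 of each kept core; every PU that falls outside the window is
  // also cleared from __kmp_affin_fullMask so the later mask construction
  // never sees it.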
  {
    int n_old = 0, n_new = 0, proc_num = 0;
    if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
      KMP_WARNING(AffHWSubsetNoHWLOC);
      goto _exit;
    }
    if (__kmp_hws_socket.num == 0)
      __kmp_hws_socket.num = nPackages; // use all available sockets
    if (__kmp_hws_core.num == 0)
      __kmp_hws_core.num = nCoresPerPkg; // use all available cores
    if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
    if (!__kmp_affinity_uniform_topology()) {
      KMP_WARNING(AffHWSubsetNonUniform);
      goto _exit; // don't support non-uniform topology
    }
    if (depth > 3) {
      KMP_WARNING(AffHWSubsetNonThreeLevel);
      goto _exit; // don't support not-3-level topology
    }
    if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
      KMP_WARNING(AffHWSubsetManySockets);
      goto _exit;
    }
    if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
      KMP_WARNING(AffHWSubsetManyCores);
      goto _exit;
    }
    // Form the requested subset
    if (pAddr) // pAddr is NULL in case of affinity_none
      newAddr = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
          __kmp_hws_proc.num);
    for (int i = 0; i < nPackages; ++i) {
      if (i < __kmp_hws_socket.offset ||
          i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
        // skip not-requested socket
        n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
        if (__kmp_pu_os_idx != NULL) {
          // walk through skipped socket
          for (int j = 0; j < nCoresPerPkg; ++j) {
            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
              KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
              ++proc_num;
            }
          }
        }
      } else {
        // walk through requested socket
        for (int j = 0; j < nCoresPerPkg; ++j) {
          if (j < __kmp_hws_core.offset ||
              j >= __kmp_hws_core.offset +
                       __kmp_hws_core.num) { // skip not-requested core
            n_old += __kmp_nThreadsPerCore;
            if (__kmp_pu_os_idx != NULL) {
              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
                ++proc_num;
              }
            }
          } else {
            // walk through requested core
            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
              if (k < __kmp_hws_proc.num) {
                if (pAddr) // collect requested thread's data
                  newAddr[n_new] = (*pAddr)[n_old];
                n_new++;
              } else {
                if (__kmp_pu_os_idx != NULL)
                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
              }
              n_old++;
              ++proc_num;
            }
          }
        }
      }
    }
    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
    KMP_DEBUG_ASSERT(n_new ==
                     __kmp_hws_socket.num * __kmp_hws_core.num *
                         __kmp_hws_proc.num);
    nPackages = __kmp_hws_socket.num; // correct nPackages
    nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
    __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
    __kmp_avail_proc = n_new; // correct avail_proc
    __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
  } // non-hwloc topology method
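  // Worked example (illustrative only): in the flat three-level walk above
  // proc_num advances socket-major, then core, then HW thread, so at
  // iteration (socket i, core j, thread k) it equals
  //   (i * nCoresPerPkg + j) * __kmp_nThreadsPerCore + k
  // under the uniform topology already verified; n_old counts every HW thread
  // that was walked while n_new counts only the ones kept in the subset.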
  if (pAddr) {
    __kmp_free(*pAddr);
    *pAddr = newAddr; // replace old topology with new one
  }
  if (__kmp_affinity_verbose) {
    char m[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
    }
    KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_str_buf_print(&buf, "%d", nPackages);
    KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
    __kmp_str_buf_free(&buf);
  }
_exit:
  if (__kmp_pu_os_idx != NULL) {
    __kmp_free(__kmp_pu_os_idx);
    __kmp_pu_os_idx = NULL;
  }
}
// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
                                          int nprocs, int bottom_level) {
  int core_level = 0;

  for (int i = 0; i < nprocs; i++) {
    for (int j = bottom_level; j > 0; j--) {
      if (address2os[i].first.labels[j] > 0) {
        if (core_level < (j - 1)) {
          core_level = j - 1;
        }
      }
    }
  }
  return core_level;
}
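// Worked example (illustrative only): with a 3-level map (bottom_level = 2,
// labels = {package, core, thread}) a machine exposing more than one thread
// per core has labels[2] > 0 for some entry, so the scan settles on
// core_level = 1; with single-threaded cores only labels[1] can be non-zero
// and the result degrades to core_level = 0.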
// This function counts number of clusters/cores at given level.
static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
                                         int nprocs, int bottom_level,
                                         int core_level) {
  int ncores = 0;
  int i, j;

  j = bottom_level;
  for (i = 0; i < nprocs; i++) {
    for (j = bottom_level; j > core_level; j--) {
      if ((i + 1) < nprocs) {
        if (address2os[i + 1].first.labels[j] > 0) {
          break;
        }
      }
    }
    if (j == core_level) {
      ncores++;
    }
  }
  if (j > core_level) {
    // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one
    // core. May occur when called from __kmp_affinity_find_core().
    ncores++;
  }
  return ncores;
}
// This function finds to which cluster/core given processing unit is bound.
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
                                       core_level) -
         1;
}
// This function finds maximal number of processing units bound to a
// cluster/core at given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
                                            int nprocs, int bottom_level,
                                            int core_level) {
  int maxprocpercore = 0;

  if (core_level < bottom_level) {
    for (int i = 0; i < nprocs; i++) {
      int percore = address2os[i].first.labels[core_level + 1] + 1;

      if (percore > maxprocpercore) {
        maxprocpercore = percore;
      }
    }
  } else {
    maxprocpercore = 1;
  }
  return maxprocpercore;
}
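// Worked example (illustrative only): with core_level = 1 and bottom_level = 2
// the loop reads labels[2], the zero-based rank of a PU within its core, so a
// PU ranked 3 yields percore = 4 and the function reports 4 even when some
// cores expose fewer PUs; when core_level already equals bottom_level the
// answer is pinned to 1.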
static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
#if KMP_USE_HIER_SCHED
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                           \
  KMP_ASSERT(address2os == NULL);                                             \
  __kmp_apply_thread_places(NULL, 0);                                         \
  __kmp_create_affinity_none_places();                                        \
  __kmp_dispatch_set_hierarchy_values();                                      \
  return;
#else
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                           \
  KMP_ASSERT(address2os == NULL);                                             \
  __kmp_apply_thread_places(NULL, 0);                                         \
  __kmp_create_affinity_none_places();                                        \
  return;
#endif

// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}
static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  return 0;
}
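// Worked example (illustrative only): with depth = 3 and
// __kmp_affinity_compact = 1 the comparator orders by childNums[2] first and
// only then by childNums[0] and childNums[1], which is what lets compact,1
// style settings interleave on the innermost level; compact = 0 degenerates
// to a plain outermost-to-innermost lexicographic sort.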
static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors
  // that we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    if (__kmp_affinity_respect_mask) {
      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);

      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }
    } else {
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
    }
  }
  if (__kmp_affinity_gran == affinity_gran_tile &&
      // check if user's request is valid
      __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
    KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
    __kmp_affinity_gran = affinity_gran_package;
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or the
    // verbose flag is set, and the nowarnings flag was not set.
    const char *file_name = NULL;
    int line = 0;
#if KMP_USE_HWLOC
    if (depth < 0 &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      }
      if (!__kmp_hwloc_error) {
        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        } else if (depth < 0 && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }

      file_name = NULL;
      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }

      if (depth < 0) {
        if (__kmp_affinity_verbose) {
          if (msg_id != kmp_i18n_null) {
            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
                       __kmp_i18n_catgets(msg_id),
                       KMP_I18N_STR(DecodingLegacyAPIC));
          } else {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
                       KMP_I18N_STR(DecodingLegacyAPIC));
          }
        }

        file_name = NULL;
        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#if KMP_OS_LINUX

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        if (msg_id != kmp_i18n_null) {
          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
        } else {
          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
        }
      }

      FILE *f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) {
        msg_id = kmp_i18n_str_CantOpenCpuinfo;
      } else {
        file_name = "/proc/cpuinfo";
        depth =
            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY

    if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
      }

      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
      KMP_ASSERT(depth != 0);
    }

#endif /* KMP_GROUP_AFFINITY */
    if (depth < 0) {
      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
        if (file_name == NULL) {
          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
        } else if (line == 0) {
          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
        } else {
          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
                     __kmp_i18n_catgets(msg_id));
        }
      }
      // FIXME - print msg if msg_id = kmp_i18n_null ???

      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }
      KMP_ASSERT(depth > 0);
      KMP_ASSERT(address2os != NULL);
    }
  }

#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
    }
    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
  }
#endif // KMP_USE_HWLOC
  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
    }

    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
    }

    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    const char *filename;
    if (__kmp_cpuinfo_file != NULL) {
      filename = __kmp_cpuinfo_file;
    } else {
      filename = "/proc/cpuinfo";
    }

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
      int code = errno;
      if (__kmp_cpuinfo_file != NULL) {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
      } else {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    __kmp_msg_null);
      }
    }
    int line = 0;
    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
    fclose(f);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
    if (__kmp_affinity_type == affinity_none) {
      KMP_ASSERT(depth == 0);
      KMP_EXIT_AFF_NONE;
    }
  }
#if KMP_GROUP_AFFINITY

  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
    }

    depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
    KMP_ASSERT(depth != 0);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
    }

    depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    // should not fail
    KMP_ASSERT(depth > 0);
    KMP_ASSERT(address2os != NULL);
  }
#if KMP_USE_HIER_SCHED
  __kmp_dispatch_set_hierarchy_values();
#endif

  if (address2os == NULL) {
    if (KMP_AFFINITY_CAPABLE() &&
        (__kmp_affinity_verbose ||
         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
      KMP_WARNING(ErrorInitializeAffinity);
    }
    __kmp_affinity_type = affinity_none;
    __kmp_create_affinity_none_places();
    KMP_AFFINITY_DISABLE();
    return;
  }

  if (__kmp_affinity_gran == affinity_gran_tile
#if KMP_USE_HWLOC
      && __kmp_tile_depth == 0
#endif
      ) {
    // tiles requested but not detected, warn user on this
    KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
  }

  __kmp_apply_thread_places(&address2os, depth);
  // Create the table of masks, indexed by thread Id.
  unsigned maxIndex;
  unsigned numUnique;
  kmp_affin_mask_t *osId2Mask =
      __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
  if (__kmp_affinity_gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  // Set the childNums vector in all Address objects. This must be done before
  // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
  // account the setting of __kmp_affinity_compact.
  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

  switch (__kmp_affinity_type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
      __kmp_affinity_process_proclist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    } else {
      __kmp_affinity_process_placelist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
    if (__kmp_affinity_num_masks == 0) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffNoValidProcID);
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    }
    break;

  // The other affinity types rely on sorting the Addresses according to some
  // permutation of the machine topology tree. Set __kmp_affinity_compact and
  // __kmp_affinity_offset appropriately, then jump to a common code fragment
  // to do the sort and create the array of affinity masks.
  case affinity_logical:
    __kmp_affinity_compact = 0;
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortAddresses;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      __kmp_affinity_compact = 1;
      if (__kmp_affinity_compact >= depth) {
        __kmp_affinity_compact = 0;
      }
    } else {
      __kmp_affinity_compact = 0;
    }
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortAddresses;

  case affinity_scatter:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = 0;
    } else {
      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
    }
    goto sortAddresses;

  case affinity_compact:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }
    goto sortAddresses;
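  // Worked example (illustrative only): these cases only normalize
  // __kmp_affinity_compact before the shared sortAddresses path below. On a
  // depth-3 topology a scatter request with permute 0 becomes
  // 3 - 1 - 0 = 2, making the innermost level the most significant sort key,
  // while compact,5 is clamped to depth - 1 = 2; the actual masks are produced
  // by the qsort over address2os that follows.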
  case affinity_balanced:
    if (depth <= 1) {
      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    } else if (!__kmp_affinity_uniform_topology()) {
      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level = __kmp_affinity_find_core_level(
          address2os, __kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                                 depth - 1, core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          address2os, __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
        }
        __kmp_affinity_type = affinity_none;
        return;
      }

      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = address2os[i].second;
        int core =
            __kmp_affinity_find_core(address2os, i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }
    }
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }

  sortAddresses:
    // Allocate the gtid->affinity mask table.
    if (__kmp_affinity_dups) {
      __kmp_affinity_num_masks = __kmp_avail_proc;
    } else {
      __kmp_affinity_num_masks = numUnique;
    }

    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
      __kmp_affinity_num_masks = __kmp_affinity_num_places;
    }

    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

    // Sort the address2os table according to the current setting of
    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
    {
      int i;
      unsigned j;
      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
          continue;
        }
        unsigned osId = address2os[i].second;
        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
  machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE
void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}
void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i;

  if (KMP_AFFINITY_NON_PROC_BIND) {
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  } else {
    if ((!isa_root) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0. Use gtid for now.
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }
  th->th.th_current_place = i;
  if (isa_root) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a Non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  if (__kmp_affinity_verbose
      /* to avoid duplicate printing (will be correctly printed on barrier) */
      && (__kmp_affinity_type == affinity_none ||
          (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
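// Worked example (illustrative only): when a place list is active the initial
// place chosen above is
//   i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks,
// so with 8 masks and an offset of 2 the thread with gtid 5 starts on place
// (5 + 2) % 8 = 7; KMP_PLACE_ALL is the sentinel used when the thread simply
// keeps the full mask.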
void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}
int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
  kmp_info_t *th;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
                 gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
                 gtid, buf);
  });
  return retval;

#else

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}
int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}
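// Worked example (illustrative only): with two Windows processor groups and a
// 64-bit DWORD_PTR the branch above reports 2 * 8 * 8 = 128 addressable OS
// procs, intentionally an upper bound rather than an exact count.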
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}
int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  switch (__kmp_affinity_gran) {
  case affinity_gran_fine:
  case affinity_gran_thread:
    break;
  case affinity_gran_core:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case affinity_gran_package:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }
  if (__kmp_affinity_uniform_topology()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to it - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }

    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = address2os[coreID * __kmp_nth_per_core + i].second;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level = __kmp_affinity_find_core_level(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors;
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}
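// Worked example (illustrative only): in the uniform branch above threads are
// dealt out contiguously; with nthreads = 10 on ncores = 4 we get chunk = 2
// and big_cores = 2, so cores 0-1 receive 3 threads each and cores 2-3
// receive 2, and tid = 7 (past big_nth = 6) lands on
// coreID = (7 - 2) / 2 = 2.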
#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
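// Usage sketch (illustrative only; every name other than
// kmp_set_thread_affinity_mask_initial is a placeholder) following the five
// steps documented above, assuming a Linux caller:
//
//   cpu_set_t saved;                                                 // step 1
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
//   if (kmp_set_thread_affinity_mask_initial() == 0) {               // steps 2-3
//     run_non_openmp_parallel_work();                                // step 4
//   }
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);   // step 5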