2 * kmp_affinity.cpp -- affinity management
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
14 #include "kmp_affinity.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
31 // The machine topology
32 kmp_topology_t
*__kmp_topology
= nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t
*__kmp_hw_subset
= nullptr;
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy
;
39 void __kmp_cleanup_hierarchy() { machine_hierarchy
.fini(); }
41 #if KMP_AFFINITY_SUPPORTED
42 // Helper class to see if place lists further restrict the fullMask
43 class kmp_full_mask_modifier_t
{
44 kmp_affin_mask_t
*mask
;
47 kmp_full_mask_modifier_t() {
51 ~kmp_full_mask_modifier_t() {
55 void include(const kmp_affin_mask_t
*other
) { KMP_CPU_UNION(mask
, other
); }
56 // If the new full mask is different from the current full mask,
57 // then switch them. Returns true if full mask was affected, false otherwise.
58 bool restrict_to_mask() {
59 // See if the new mask further restricts or changes the full mask
60 if (KMP_CPU_EQUAL(__kmp_affin_fullMask
, mask
) || KMP_CPU_ISEMPTY(mask
))
62 return __kmp_topology
->restrict_to_mask(mask
);
66 static inline const char *
67 __kmp_get_affinity_env_var(const kmp_affinity_t
&affinity
,
68 bool for_binding
= false) {
69 if (affinity
.flags
.omp_places
) {
71 return "OMP_PROC_BIND";
74 return affinity
.env_var
;
76 #endif // KMP_AFFINITY_SUPPORTED
78 void __kmp_get_hierarchy(kmp_uint32 nproc
, kmp_bstate_t
*thr_bar
) {
80 // The test below is true if affinity is available, but set to "none". Need to
81 // init on first use of hierarchical barrier.
82 if (TCR_1(machine_hierarchy
.uninitialized
))
83 machine_hierarchy
.init(nproc
);
85 // Adjust the hierarchy in case num threads exceeds original
86 if (nproc
> machine_hierarchy
.base_num_threads
)
87 machine_hierarchy
.resize(nproc
);
89 depth
= machine_hierarchy
.depth
;
90 KMP_DEBUG_ASSERT(depth
> 0);
92 thr_bar
->depth
= depth
;
93 __kmp_type_convert(machine_hierarchy
.numPerLevel
[0] - 1,
94 &(thr_bar
->base_leaf_kids
));
95 thr_bar
->skip_per_level
= machine_hierarchy
.skipPerLevel
;
98 static int nCoresPerPkg
, nPackages
;
99 static int __kmp_nThreadsPerCore
;
100 #ifndef KMP_DFLT_NTH_CORES
101 static int __kmp_ncores
;
104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type
, bool plural
) {
107 return ((plural
) ? KMP_I18N_STR(Sockets
) : KMP_I18N_STR(Socket
));
109 return ((plural
) ? KMP_I18N_STR(Dice
) : KMP_I18N_STR(Die
));
111 return ((plural
) ? KMP_I18N_STR(Modules
) : KMP_I18N_STR(Module
));
113 return ((plural
) ? KMP_I18N_STR(Tiles
) : KMP_I18N_STR(Tile
));
115 return ((plural
) ? KMP_I18N_STR(NumaDomains
) : KMP_I18N_STR(NumaDomain
));
117 return ((plural
) ? KMP_I18N_STR(L3Caches
) : KMP_I18N_STR(L3Cache
));
119 return ((plural
) ? KMP_I18N_STR(L2Caches
) : KMP_I18N_STR(L2Cache
));
121 return ((plural
) ? KMP_I18N_STR(L1Caches
) : KMP_I18N_STR(L1Cache
));
123 return ((plural
) ? KMP_I18N_STR(LLCaches
) : KMP_I18N_STR(LLCache
));
125 return ((plural
) ? KMP_I18N_STR(Cores
) : KMP_I18N_STR(Core
));
127 return ((plural
) ? KMP_I18N_STR(Threads
) : KMP_I18N_STR(Thread
));
128 case KMP_HW_PROC_GROUP
:
129 return ((plural
) ? KMP_I18N_STR(ProcGroups
) : KMP_I18N_STR(ProcGroup
));
131 return KMP_I18N_STR(Unknown
);
134 const char *__kmp_hw_get_keyword(kmp_hw_t type
, bool plural
) {
137 return ((plural
) ? "sockets" : "socket");
139 return ((plural
) ? "dice" : "die");
141 return ((plural
) ? "modules" : "module");
143 return ((plural
) ? "tiles" : "tile");
145 return ((plural
) ? "numa_domains" : "numa_domain");
147 return ((plural
) ? "l3_caches" : "l3_cache");
149 return ((plural
) ? "l2_caches" : "l2_cache");
151 return ((plural
) ? "l1_caches" : "l1_cache");
153 return ((plural
) ? "ll_caches" : "ll_cache");
155 return ((plural
) ? "cores" : "core");
157 return ((plural
) ? "threads" : "thread");
158 case KMP_HW_PROC_GROUP
:
159 return ((plural
) ? "proc_groups" : "proc_group");
161 return ((plural
) ? "unknowns" : "unknown");
164 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type
) {
166 case KMP_HW_CORE_TYPE_UNKNOWN
:
168 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
169 case KMP_HW_CORE_TYPE_ATOM
:
170 return "Intel Atom(R) processor";
171 case KMP_HW_CORE_TYPE_CORE
:
172 return "Intel(R) Core(TM) processor";
178 #if KMP_AFFINITY_SUPPORTED
179 // If affinity is supported, check the affinity
180 // verbose and warning flags before printing warning
181 #define KMP_AFF_WARNING(s, ...) \
182 if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
183 KMP_WARNING(__VA_ARGS__); \
186 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
189 ////////////////////////////////////////////////////////////////////////////////
190 // kmp_hw_thread_t methods
191 int kmp_hw_thread_t::compare_ids(const void *a
, const void *b
) {
192 const kmp_hw_thread_t
*ahwthread
= (const kmp_hw_thread_t
*)a
;
193 const kmp_hw_thread_t
*bhwthread
= (const kmp_hw_thread_t
*)b
;
194 int depth
= __kmp_topology
->get_depth();
195 for (int level
= 0; level
< depth
; ++level
) {
196 if (ahwthread
->ids
[level
] < bhwthread
->ids
[level
])
198 else if (ahwthread
->ids
[level
] > bhwthread
->ids
[level
])
201 if (ahwthread
->os_id
< bhwthread
->os_id
)
203 else if (ahwthread
->os_id
> bhwthread
->os_id
)
208 #if KMP_AFFINITY_SUPPORTED
209 int kmp_hw_thread_t::compare_compact(const void *a
, const void *b
) {
211 const kmp_hw_thread_t
*aa
= (const kmp_hw_thread_t
*)a
;
212 const kmp_hw_thread_t
*bb
= (const kmp_hw_thread_t
*)b
;
213 int depth
= __kmp_topology
->get_depth();
214 int compact
= __kmp_topology
->compact
;
215 KMP_DEBUG_ASSERT(compact
>= 0);
216 KMP_DEBUG_ASSERT(compact
<= depth
);
217 for (i
= 0; i
< compact
; i
++) {
218 int j
= depth
- i
- 1;
219 if (aa
->sub_ids
[j
] < bb
->sub_ids
[j
])
221 if (aa
->sub_ids
[j
] > bb
->sub_ids
[j
])
224 for (; i
< depth
; i
++) {
226 if (aa
->sub_ids
[j
] < bb
->sub_ids
[j
])
228 if (aa
->sub_ids
[j
] > bb
->sub_ids
[j
])
235 void kmp_hw_thread_t::print() const {
236 int depth
= __kmp_topology
->get_depth();
237 printf("%4d ", os_id
);
238 for (int i
= 0; i
< depth
; ++i
) {
239 printf("%4d ", ids
[i
]);
242 if (attrs
.is_core_type_valid())
243 printf(" (%s)", __kmp_hw_get_core_type_string(attrs
.get_core_type()));
244 if (attrs
.is_core_eff_valid())
245 printf(" (eff=%d)", attrs
.get_core_eff());
252 ////////////////////////////////////////////////////////////////////////////////
253 // kmp_topology_t methods
255 // Add a layer to the topology based on the ids. Assume the topology
256 // is perfectly nested (i.e., so no object has more than one parent)
257 void kmp_topology_t::_insert_layer(kmp_hw_t type
, const int *ids
) {
258 // Figure out where the layer should go by comparing the ids of the current
259 // layers with the new ids
261 int previous_id
= kmp_hw_thread_t::UNKNOWN_ID
;
262 int previous_new_id
= kmp_hw_thread_t::UNKNOWN_ID
;
264 // Start from the highest layer and work down to find target layer
265 // If new layer is equal to another layer then put the new layer above
266 for (target_layer
= 0; target_layer
< depth
; ++target_layer
) {
267 bool layers_equal
= true;
268 bool strictly_above_target_layer
= false;
269 for (int i
= 0; i
< num_hw_threads
; ++i
) {
270 int id
= hw_threads
[i
].ids
[target_layer
];
272 if (id
!= previous_id
&& new_id
== previous_new_id
) {
273 // Found the layer we are strictly above
274 strictly_above_target_layer
= true;
275 layers_equal
= false;
277 } else if (id
== previous_id
&& new_id
!= previous_new_id
) {
278 // Found a layer we are below. Move to next layer and check.
279 layers_equal
= false;
283 previous_new_id
= new_id
;
285 if (strictly_above_target_layer
|| layers_equal
)
289 // Found the layer we are above. Now move everything to accommodate the new
290 // layer. And put the new ids and type into the topology.
291 for (int i
= depth
- 1, j
= depth
; i
>= target_layer
; --i
, --j
)
293 types
[target_layer
] = type
;
294 for (int k
= 0; k
< num_hw_threads
; ++k
) {
295 for (int i
= depth
- 1, j
= depth
; i
>= target_layer
; --i
, --j
)
296 hw_threads
[k
].ids
[j
] = hw_threads
[k
].ids
[i
];
297 hw_threads
[k
].ids
[target_layer
] = ids
[k
];
299 equivalent
[type
] = type
;
303 #if KMP_GROUP_AFFINITY
304 // Insert the Windows Processor Group structure into the topology
305 void kmp_topology_t::_insert_windows_proc_groups() {
306 // Do not insert the processor group structure for a single group
307 if (__kmp_num_proc_groups
== 1)
309 kmp_affin_mask_t
*mask
;
310 int *ids
= (int *)__kmp_allocate(sizeof(int) * num_hw_threads
);
312 for (int i
= 0; i
< num_hw_threads
; ++i
) {
314 KMP_CPU_SET(hw_threads
[i
].os_id
, mask
);
315 ids
[i
] = __kmp_get_proc_group(mask
);
318 _insert_layer(KMP_HW_PROC_GROUP
, ids
);
323 // Remove layers that don't add information to the topology.
324 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
325 void kmp_topology_t::_remove_radix1_layers() {
326 int preference
[KMP_HW_LAST
];
327 int top_index1
, top_index2
;
328 // Set up preference associative array
329 preference
[KMP_HW_SOCKET
] = 110;
330 preference
[KMP_HW_PROC_GROUP
] = 100;
331 preference
[KMP_HW_CORE
] = 95;
332 preference
[KMP_HW_THREAD
] = 90;
333 preference
[KMP_HW_NUMA
] = 85;
334 preference
[KMP_HW_DIE
] = 80;
335 preference
[KMP_HW_TILE
] = 75;
336 preference
[KMP_HW_MODULE
] = 73;
337 preference
[KMP_HW_L3
] = 70;
338 preference
[KMP_HW_L2
] = 65;
339 preference
[KMP_HW_L1
] = 60;
340 preference
[KMP_HW_LLC
] = 5;
343 while (top_index1
< depth
- 1 && top_index2
< depth
) {
344 kmp_hw_t type1
= types
[top_index1
];
345 kmp_hw_t type2
= types
[top_index2
];
346 KMP_ASSERT_VALID_HW_TYPE(type1
);
347 KMP_ASSERT_VALID_HW_TYPE(type2
);
348 // Do not allow the three main topology levels (sockets, cores, threads) to
350 if ((type1
== KMP_HW_THREAD
|| type1
== KMP_HW_CORE
||
351 type1
== KMP_HW_SOCKET
) &&
352 (type2
== KMP_HW_THREAD
|| type2
== KMP_HW_CORE
||
353 type2
== KMP_HW_SOCKET
)) {
354 top_index1
= top_index2
++;
358 bool all_same
= true;
359 int id1
= hw_threads
[0].ids
[top_index1
];
360 int id2
= hw_threads
[0].ids
[top_index2
];
361 int pref1
= preference
[type1
];
362 int pref2
= preference
[type2
];
363 for (int hwidx
= 1; hwidx
< num_hw_threads
; ++hwidx
) {
364 if (hw_threads
[hwidx
].ids
[top_index1
] == id1
&&
365 hw_threads
[hwidx
].ids
[top_index2
] != id2
) {
369 if (hw_threads
[hwidx
].ids
[top_index2
] != id2
)
371 id1
= hw_threads
[hwidx
].ids
[top_index1
];
372 id2
= hw_threads
[hwidx
].ids
[top_index2
];
375 // Select the layer to remove based on preference
376 kmp_hw_t remove_type
, keep_type
;
377 int remove_layer
, remove_layer_ids
;
380 remove_layer
= remove_layer_ids
= top_index2
;
384 remove_layer
= remove_layer_ids
= top_index1
;
387 // If all the indexes for the second (deeper) layer are the same.
388 // e.g., all are zero, then make sure to keep the first layer's ids
390 remove_layer_ids
= top_index2
;
391 // Remove radix one type by setting the equivalence, removing the id from
392 // the hw threads and removing the layer from types and depth
393 set_equivalent_type(remove_type
, keep_type
);
394 for (int idx
= 0; idx
< num_hw_threads
; ++idx
) {
395 kmp_hw_thread_t
&hw_thread
= hw_threads
[idx
];
396 for (int d
= remove_layer_ids
; d
< depth
- 1; ++d
)
397 hw_thread
.ids
[d
] = hw_thread
.ids
[d
+ 1];
399 for (int idx
= remove_layer
; idx
< depth
- 1; ++idx
)
400 types
[idx
] = types
[idx
+ 1];
403 top_index1
= top_index2
++;
406 KMP_ASSERT(depth
> 0);
409 void kmp_topology_t::_set_last_level_cache() {
410 if (get_equivalent_type(KMP_HW_L3
) != KMP_HW_UNKNOWN
)
411 set_equivalent_type(KMP_HW_LLC
, KMP_HW_L3
);
412 else if (get_equivalent_type(KMP_HW_L2
) != KMP_HW_UNKNOWN
)
413 set_equivalent_type(KMP_HW_LLC
, KMP_HW_L2
);
414 #if KMP_MIC_SUPPORTED
415 else if (__kmp_mic_type
== mic3
) {
416 if (get_equivalent_type(KMP_HW_L2
) != KMP_HW_UNKNOWN
)
417 set_equivalent_type(KMP_HW_LLC
, KMP_HW_L2
);
418 else if (get_equivalent_type(KMP_HW_TILE
) != KMP_HW_UNKNOWN
)
419 set_equivalent_type(KMP_HW_LLC
, KMP_HW_TILE
);
420 // L2/Tile wasn't detected so just say L1
422 set_equivalent_type(KMP_HW_LLC
, KMP_HW_L1
);
425 else if (get_equivalent_type(KMP_HW_L1
) != KMP_HW_UNKNOWN
)
426 set_equivalent_type(KMP_HW_LLC
, KMP_HW_L1
);
427 // Fallback is to set last level cache to socket or core
428 if (get_equivalent_type(KMP_HW_LLC
) == KMP_HW_UNKNOWN
) {
429 if (get_equivalent_type(KMP_HW_SOCKET
) != KMP_HW_UNKNOWN
)
430 set_equivalent_type(KMP_HW_LLC
, KMP_HW_SOCKET
);
431 else if (get_equivalent_type(KMP_HW_CORE
) != KMP_HW_UNKNOWN
)
432 set_equivalent_type(KMP_HW_LLC
, KMP_HW_CORE
);
434 KMP_ASSERT(get_equivalent_type(KMP_HW_LLC
) != KMP_HW_UNKNOWN
);
437 // Gather the count of each topology layer and the ratio
438 void kmp_topology_t::_gather_enumeration_information() {
439 int previous_id
[KMP_HW_LAST
];
440 int max
[KMP_HW_LAST
];
442 for (int i
= 0; i
< depth
; ++i
) {
443 previous_id
[i
] = kmp_hw_thread_t::UNKNOWN_ID
;
448 int core_level
= get_level(KMP_HW_CORE
);
449 for (int i
= 0; i
< num_hw_threads
; ++i
) {
450 kmp_hw_thread_t
&hw_thread
= hw_threads
[i
];
451 for (int layer
= 0; layer
< depth
; ++layer
) {
452 int id
= hw_thread
.ids
[layer
];
453 if (id
!= previous_id
[layer
]) {
454 // Add an additional increment to each count
455 for (int l
= layer
; l
< depth
; ++l
)
457 // Keep track of topology layer ratio statistics
459 for (int l
= layer
+ 1; l
< depth
; ++l
) {
460 if (max
[l
] > ratio
[l
])
464 // Figure out the number of different core types
465 // and efficiencies for hybrid CPUs
466 if (__kmp_is_hybrid_cpu() && core_level
>= 0 && layer
<= core_level
) {
467 if (hw_thread
.attrs
.is_core_eff_valid() &&
468 hw_thread
.attrs
.core_eff
>= num_core_efficiencies
) {
469 // Because efficiencies can range from 0 to max efficiency - 1,
470 // the number of efficiencies is max efficiency + 1
471 num_core_efficiencies
= hw_thread
.attrs
.core_eff
+ 1;
473 if (hw_thread
.attrs
.is_core_type_valid()) {
475 for (int j
= 0; j
< num_core_types
; ++j
) {
476 if (hw_thread
.attrs
.get_core_type() == core_types
[j
]) {
482 KMP_ASSERT(num_core_types
< KMP_HW_MAX_NUM_CORE_TYPES
);
483 core_types
[num_core_types
++] = hw_thread
.attrs
.get_core_type();
490 for (int layer
= 0; layer
< depth
; ++layer
) {
491 previous_id
[layer
] = hw_thread
.ids
[layer
];
494 for (int layer
= 0; layer
< depth
; ++layer
) {
495 if (max
[layer
] > ratio
[layer
])
496 ratio
[layer
] = max
[layer
];
500 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t
&attr
,
502 bool find_all
) const {
503 int current
, current_max
;
504 int previous_id
[KMP_HW_LAST
];
505 for (int i
= 0; i
< depth
; ++i
)
506 previous_id
[i
] = kmp_hw_thread_t::UNKNOWN_ID
;
507 int core_level
= get_level(KMP_HW_CORE
);
510 KMP_ASSERT(above_level
< core_level
);
513 for (int i
= 0; i
< num_hw_threads
; ++i
) {
514 kmp_hw_thread_t
&hw_thread
= hw_threads
[i
];
515 if (!find_all
&& hw_thread
.ids
[above_level
] != previous_id
[above_level
]) {
516 if (current
> current_max
)
517 current_max
= current
;
518 current
= hw_thread
.attrs
.contains(attr
);
520 for (int level
= above_level
+ 1; level
<= core_level
; ++level
) {
521 if (hw_thread
.ids
[level
] != previous_id
[level
]) {
522 if (hw_thread
.attrs
.contains(attr
))
528 for (int level
= 0; level
< depth
; ++level
)
529 previous_id
[level
] = hw_thread
.ids
[level
];
531 if (current
> current_max
)
532 current_max
= current
;
536 // Find out if the topology is uniform
537 void kmp_topology_t::_discover_uniformity() {
539 for (int level
= 0; level
< depth
; ++level
)
541 flags
.uniform
= (num
== count
[depth
- 1]);
544 // Set all the sub_ids for each hardware thread
545 void kmp_topology_t::_set_sub_ids() {
546 int previous_id
[KMP_HW_LAST
];
547 int sub_id
[KMP_HW_LAST
];
549 for (int i
= 0; i
< depth
; ++i
) {
553 for (int i
= 0; i
< num_hw_threads
; ++i
) {
554 kmp_hw_thread_t
&hw_thread
= hw_threads
[i
];
556 for (int j
= 0; j
< depth
; ++j
) {
557 if (hw_thread
.ids
[j
] != previous_id
[j
]) {
559 for (int k
= j
+ 1; k
< depth
; ++k
) {
566 for (int j
= 0; j
< depth
; ++j
) {
567 previous_id
[j
] = hw_thread
.ids
[j
];
569 // Set the sub_ids field
570 for (int j
= 0; j
< depth
; ++j
) {
571 hw_thread
.sub_ids
[j
] = sub_id
[j
];
576 void kmp_topology_t::_set_globals() {
577 // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
578 int core_level
, thread_level
, package_level
;
579 package_level
= get_level(KMP_HW_SOCKET
);
580 #if KMP_GROUP_AFFINITY
581 if (package_level
== -1)
582 package_level
= get_level(KMP_HW_PROC_GROUP
);
584 core_level
= get_level(KMP_HW_CORE
);
585 thread_level
= get_level(KMP_HW_THREAD
);
587 KMP_ASSERT(core_level
!= -1);
588 KMP_ASSERT(thread_level
!= -1);
590 __kmp_nThreadsPerCore
= calculate_ratio(thread_level
, core_level
);
591 if (package_level
!= -1) {
592 nCoresPerPkg
= calculate_ratio(core_level
, package_level
);
593 nPackages
= get_count(package_level
);
596 nCoresPerPkg
= get_count(core_level
);
599 #ifndef KMP_DFLT_NTH_CORES
600 __kmp_ncores
= get_count(core_level
);
604 kmp_topology_t
*kmp_topology_t::allocate(int nproc
, int ndepth
,
605 const kmp_hw_t
*types
) {
606 kmp_topology_t
*retval
;
607 // Allocate all data in one large allocation
608 size_t size
= sizeof(kmp_topology_t
) + sizeof(kmp_hw_thread_t
) * nproc
+
609 sizeof(int) * (size_t)KMP_HW_LAST
* 3;
610 char *bytes
= (char *)__kmp_allocate(size
);
611 retval
= (kmp_topology_t
*)bytes
;
613 retval
->hw_threads
= (kmp_hw_thread_t
*)(bytes
+ sizeof(kmp_topology_t
));
615 retval
->hw_threads
= nullptr;
617 retval
->num_hw_threads
= nproc
;
618 retval
->depth
= ndepth
;
620 (int *)(bytes
+ sizeof(kmp_topology_t
) + sizeof(kmp_hw_thread_t
) * nproc
);
621 retval
->types
= (kmp_hw_t
*)arr
;
622 retval
->ratio
= arr
+ (size_t)KMP_HW_LAST
;
623 retval
->count
= arr
+ 2 * (size_t)KMP_HW_LAST
;
624 retval
->num_core_efficiencies
= 0;
625 retval
->num_core_types
= 0;
627 for (int i
= 0; i
< KMP_HW_MAX_NUM_CORE_TYPES
; ++i
)
628 retval
->core_types
[i
] = KMP_HW_CORE_TYPE_UNKNOWN
;
629 KMP_FOREACH_HW_TYPE(type
) { retval
->equivalent
[type
] = KMP_HW_UNKNOWN
; }
630 for (int i
= 0; i
< ndepth
; ++i
) {
631 retval
->types
[i
] = types
[i
];
632 retval
->equivalent
[types
[i
]] = types
[i
];
637 void kmp_topology_t::deallocate(kmp_topology_t
*topology
) {
639 __kmp_free(topology
);
642 bool kmp_topology_t::check_ids() const {
643 // Assume ids have been sorted
644 if (num_hw_threads
== 0)
646 for (int i
= 1; i
< num_hw_threads
; ++i
) {
647 kmp_hw_thread_t
¤t_thread
= hw_threads
[i
];
648 kmp_hw_thread_t
&previous_thread
= hw_threads
[i
- 1];
650 for (int j
= 0; j
< depth
; ++j
) {
651 if (previous_thread
.ids
[j
] != current_thread
.ids
[j
]) {
663 void kmp_topology_t::dump() const {
664 printf("***********************\n");
665 printf("*** __kmp_topology: ***\n");
666 printf("***********************\n");
667 printf("* depth: %d\n", depth
);
670 for (int i
= 0; i
< depth
; ++i
)
671 printf("%15s ", __kmp_hw_get_keyword(types
[i
]));
675 for (int i
= 0; i
< depth
; ++i
) {
676 printf("%15d ", ratio
[i
]);
681 for (int i
= 0; i
< depth
; ++i
) {
682 printf("%15d ", count
[i
]);
686 printf("* num_core_eff: %d\n", num_core_efficiencies
);
687 printf("* num_core_types: %d\n", num_core_types
);
688 printf("* core_types: ");
689 for (int i
= 0; i
< num_core_types
; ++i
)
690 printf("%3d ", core_types
[i
]);
693 printf("* equivalent map:\n");
694 KMP_FOREACH_HW_TYPE(i
) {
695 const char *key
= __kmp_hw_get_keyword(i
);
696 const char *value
= __kmp_hw_get_keyword(equivalent
[i
]);
697 printf("%-15s -> %-15s\n", key
, value
);
700 printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
702 printf("* num_hw_threads: %d\n", num_hw_threads
);
703 printf("* hw_threads:\n");
704 for (int i
= 0; i
< num_hw_threads
; ++i
) {
705 hw_threads
[i
].print();
707 printf("***********************\n");
710 void kmp_topology_t::print(const char *env_var
) const {
712 int print_types_depth
;
713 __kmp_str_buf_init(&buf
);
714 kmp_hw_t print_types
[KMP_HW_LAST
+ 2];
716 // Num Available Threads
717 if (num_hw_threads
) {
718 KMP_INFORM(AvailableOSProc
, env_var
, num_hw_threads
);
720 KMP_INFORM(AvailableOSProc
, env_var
, __kmp_xproc
);
725 KMP_INFORM(Uniform
, env_var
);
727 KMP_INFORM(NonUniform
, env_var
);
731 KMP_FOREACH_HW_TYPE(type
) {
732 kmp_hw_t eq_type
= equivalent
[type
];
733 if (eq_type
!= KMP_HW_UNKNOWN
&& eq_type
!= type
) {
734 KMP_INFORM(AffEqualTopologyTypes
, env_var
,
735 __kmp_hw_get_catalog_string(type
),
736 __kmp_hw_get_catalog_string(eq_type
));
741 KMP_ASSERT(depth
> 0 && depth
<= (int)KMP_HW_LAST
);
742 // Create a print types array that always guarantees printing
743 // the core and thread level
744 print_types_depth
= 0;
745 for (int level
= 0; level
< depth
; ++level
)
746 print_types
[print_types_depth
++] = types
[level
];
747 if (equivalent
[KMP_HW_CORE
] != KMP_HW_CORE
) {
748 // Force in the core level for quick topology
749 if (print_types
[print_types_depth
- 1] == KMP_HW_THREAD
) {
750 // Force core before thread e.g., 1 socket X 2 threads/socket
751 // becomes 1 socket X 1 core/socket X 2 threads/socket
752 print_types
[print_types_depth
- 1] = KMP_HW_CORE
;
753 print_types
[print_types_depth
++] = KMP_HW_THREAD
;
755 print_types
[print_types_depth
++] = KMP_HW_CORE
;
758 // Always put threads at very end of quick topology
759 if (equivalent
[KMP_HW_THREAD
] != KMP_HW_THREAD
)
760 print_types
[print_types_depth
++] = KMP_HW_THREAD
;
762 __kmp_str_buf_clear(&buf
);
763 kmp_hw_t numerator_type
;
764 kmp_hw_t denominator_type
= KMP_HW_UNKNOWN
;
765 int core_level
= get_level(KMP_HW_CORE
);
766 int ncores
= get_count(core_level
);
768 for (int plevel
= 0, level
= 0; plevel
< print_types_depth
; ++plevel
) {
771 numerator_type
= print_types
[plevel
];
772 KMP_ASSERT_VALID_HW_TYPE(numerator_type
);
773 if (equivalent
[numerator_type
] != numerator_type
)
776 c
= get_ratio(level
++);
779 __kmp_str_buf_print(&buf
, "%d %s", c
,
780 __kmp_hw_get_catalog_string(numerator_type
, plural
));
782 __kmp_str_buf_print(&buf
, " x %d %s/%s", c
,
783 __kmp_hw_get_catalog_string(numerator_type
, plural
),
784 __kmp_hw_get_catalog_string(denominator_type
));
786 denominator_type
= numerator_type
;
788 KMP_INFORM(TopologyGeneric
, env_var
, buf
.str
, ncores
);
790 // Hybrid topology information
791 if (__kmp_is_hybrid_cpu()) {
792 for (int i
= 0; i
< num_core_types
; ++i
) {
793 kmp_hw_core_type_t core_type
= core_types
[i
];
796 attr
.set_core_type(core_type
);
797 int ncores
= get_ncores_with_attr(attr
);
799 KMP_INFORM(TopologyHybrid
, env_var
, ncores
,
800 __kmp_hw_get_core_type_string(core_type
));
801 KMP_ASSERT(num_core_efficiencies
<= KMP_HW_MAX_NUM_CORE_EFFS
)
802 for (int eff
= 0; eff
< num_core_efficiencies
; ++eff
) {
803 attr
.set_core_eff(eff
);
804 int ncores_with_eff
= get_ncores_with_attr(attr
);
805 if (ncores_with_eff
> 0) {
806 KMP_INFORM(TopologyHybridCoreEff
, env_var
, ncores_with_eff
, eff
);
813 if (num_hw_threads
<= 0) {
814 __kmp_str_buf_free(&buf
);
818 // Full OS proc to hardware thread map
819 KMP_INFORM(OSProcToPhysicalThreadMap
, env_var
);
820 for (int i
= 0; i
< num_hw_threads
; i
++) {
821 __kmp_str_buf_clear(&buf
);
822 for (int level
= 0; level
< depth
; ++level
) {
823 kmp_hw_t type
= types
[level
];
824 __kmp_str_buf_print(&buf
, "%s ", __kmp_hw_get_catalog_string(type
));
825 __kmp_str_buf_print(&buf
, "%d ", hw_threads
[i
].ids
[level
]);
827 if (__kmp_is_hybrid_cpu())
830 __kmp_hw_get_core_type_string(hw_threads
[i
].attrs
.get_core_type()));
831 KMP_INFORM(OSProcMapToPack
, env_var
, hw_threads
[i
].os_id
, buf
.str
);
834 __kmp_str_buf_free(&buf
);
837 #if KMP_AFFINITY_SUPPORTED
838 void kmp_topology_t::set_granularity(kmp_affinity_t
&affinity
) const {
839 const char *env_var
= __kmp_get_affinity_env_var(affinity
);
840 // If requested hybrid CPU attributes for granularity (either OMP_PLACES or
841 // KMP_AFFINITY), but none exist, then reset granularity and have below method
842 // select a granularity and warn user.
843 if (!__kmp_is_hybrid_cpu()) {
844 if (affinity
.core_attr_gran
.valid
) {
845 // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
848 affinity
, AffIgnoringNonHybrid
, env_var
,
849 __kmp_hw_get_catalog_string(KMP_HW_CORE
, /*plural=*/true));
850 affinity
.gran
= KMP_HW_CORE
;
851 affinity
.gran_levels
= -1;
852 affinity
.core_attr_gran
= KMP_AFFINITY_ATTRS_UNKNOWN
;
853 affinity
.flags
.core_types_gran
= affinity
.flags
.core_effs_gran
= 0;
854 } else if (affinity
.flags
.core_types_gran
||
855 affinity
.flags
.core_effs_gran
) {
856 // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
857 if (affinity
.flags
.omp_places
) {
859 affinity
, AffIgnoringNonHybrid
, env_var
,
860 __kmp_hw_get_catalog_string(KMP_HW_CORE
, /*plural=*/true));
862 // KMP_AFFINITY=granularity=core_type|core_eff,...
863 KMP_AFF_WARNING(affinity
, AffGranularityBad
, env_var
,
864 "Intel(R) Hybrid Technology core attribute",
865 __kmp_hw_get_catalog_string(KMP_HW_CORE
));
867 affinity
.gran
= KMP_HW_CORE
;
868 affinity
.gran_levels
= -1;
869 affinity
.core_attr_gran
= KMP_AFFINITY_ATTRS_UNKNOWN
;
870 affinity
.flags
.core_types_gran
= affinity
.flags
.core_effs_gran
= 0;
873 // Set the number of affinity granularity levels
874 if (affinity
.gran_levels
< 0) {
875 kmp_hw_t gran_type
= get_equivalent_type(affinity
.gran
);
876 // Check if user's granularity request is valid
877 if (gran_type
== KMP_HW_UNKNOWN
) {
878 // First try core, then thread, then package
879 kmp_hw_t gran_types
[3] = {KMP_HW_CORE
, KMP_HW_THREAD
, KMP_HW_SOCKET
};
880 for (auto g
: gran_types
) {
881 if (get_equivalent_type(g
) != KMP_HW_UNKNOWN
) {
886 KMP_ASSERT(gran_type
!= KMP_HW_UNKNOWN
);
887 // Warn user what granularity setting will be used instead
888 KMP_AFF_WARNING(affinity
, AffGranularityBad
, env_var
,
889 __kmp_hw_get_catalog_string(affinity
.gran
),
890 __kmp_hw_get_catalog_string(gran_type
));
891 affinity
.gran
= gran_type
;
893 #if KMP_GROUP_AFFINITY
894 // If more than one processor group exists, and the level of
895 // granularity specified by the user is too coarse, then the
896 // granularity must be adjusted "down" to processor group affinity
897 // because threads can only exist within one processor group.
898 // For example, if a user sets granularity=socket and there are two
899 // processor groups that cover a socket, then the runtime must
900 // restrict the granularity down to the processor group level.
901 if (__kmp_num_proc_groups
> 1) {
902 int gran_depth
= get_level(gran_type
);
903 int proc_group_depth
= get_level(KMP_HW_PROC_GROUP
);
904 if (gran_depth
>= 0 && proc_group_depth
>= 0 &&
905 gran_depth
< proc_group_depth
) {
906 KMP_AFF_WARNING(affinity
, AffGranTooCoarseProcGroup
, env_var
,
907 __kmp_hw_get_catalog_string(affinity
.gran
));
908 affinity
.gran
= gran_type
= KMP_HW_PROC_GROUP
;
912 affinity
.gran_levels
= 0;
913 for (int i
= depth
- 1; i
>= 0 && get_type(i
) != gran_type
; --i
)
914 affinity
.gran_levels
++;
919 void kmp_topology_t::canonicalize() {
920 #if KMP_GROUP_AFFINITY
921 _insert_windows_proc_groups();
923 _remove_radix1_layers();
924 _gather_enumeration_information();
925 _discover_uniformity();
928 _set_last_level_cache();
930 #if KMP_MIC_SUPPORTED
931 // Manually Add L2 = Tile equivalence
932 if (__kmp_mic_type
== mic3
) {
933 if (get_level(KMP_HW_L2
) != -1)
934 set_equivalent_type(KMP_HW_TILE
, KMP_HW_L2
);
935 else if (get_level(KMP_HW_TILE
) != -1)
936 set_equivalent_type(KMP_HW_L2
, KMP_HW_TILE
);
940 // Perform post canonicalization checking
941 KMP_ASSERT(depth
> 0);
942 for (int level
= 0; level
< depth
; ++level
) {
943 // All counts, ratios, and types must be valid
944 KMP_ASSERT(count
[level
] > 0 && ratio
[level
] > 0);
945 KMP_ASSERT_VALID_HW_TYPE(types
[level
]);
946 // Detected types must point to themselves
947 KMP_ASSERT(equivalent
[types
[level
]] == types
[level
]);
951 // Canonicalize an explicit packages X cores/pkg X threads/core topology
952 void kmp_topology_t::canonicalize(int npackages
, int ncores_per_pkg
,
953 int nthreads_per_core
, int ncores
) {
956 KMP_FOREACH_HW_TYPE(i
) { equivalent
[i
] = KMP_HW_UNKNOWN
; }
957 for (int level
= 0; level
< depth
; ++level
) {
961 count
[0] = npackages
;
963 count
[2] = __kmp_xproc
;
964 ratio
[0] = npackages
;
965 ratio
[1] = ncores_per_pkg
;
966 ratio
[2] = nthreads_per_core
;
967 equivalent
[KMP_HW_SOCKET
] = KMP_HW_SOCKET
;
968 equivalent
[KMP_HW_CORE
] = KMP_HW_CORE
;
969 equivalent
[KMP_HW_THREAD
] = KMP_HW_THREAD
;
970 types
[0] = KMP_HW_SOCKET
;
971 types
[1] = KMP_HW_CORE
;
972 types
[2] = KMP_HW_THREAD
;
973 //__kmp_avail_proc = __kmp_xproc;
974 _discover_uniformity();
977 // Represents running sub IDs for a single core attribute where
978 // attribute values have SIZE possibilities.
979 template <size_t SIZE
, typename IndexFunc
> struct kmp_sub_ids_t
{
980 int last_level
; // last level in topology to consider for sub_ids
981 int sub_id
[SIZE
]; // The sub ID for a given attribute value
982 int prev_sub_id
[KMP_HW_LAST
];
986 kmp_sub_ids_t(int last_level
) : last_level(last_level
) {
987 KMP_ASSERT(last_level
< KMP_HW_LAST
);
988 for (size_t i
= 0; i
< SIZE
; ++i
)
990 for (size_t i
= 0; i
< KMP_HW_LAST
; ++i
)
993 void update(const kmp_hw_thread_t
&hw_thread
) {
994 int idx
= indexer(hw_thread
);
995 KMP_ASSERT(idx
< (int)SIZE
);
996 for (int level
= 0; level
<= last_level
; ++level
) {
997 if (hw_thread
.sub_ids
[level
] != prev_sub_id
[level
]) {
998 if (level
< last_level
)
1004 for (int level
= 0; level
<= last_level
; ++level
)
1005 prev_sub_id
[level
] = hw_thread
.sub_ids
[level
];
1007 int get_sub_id(const kmp_hw_thread_t
&hw_thread
) const {
1008 return sub_id
[indexer(hw_thread
)];
1012 #if KMP_AFFINITY_SUPPORTED
1013 static kmp_str_buf_t
*
1014 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t
&attr
, kmp_str_buf_t
*buf
,
1016 __kmp_str_buf_init(buf
);
1017 if (attr
.is_core_type_valid())
1018 __kmp_str_buf_print(buf
, "%s %s",
1019 __kmp_hw_get_core_type_string(attr
.get_core_type()),
1020 __kmp_hw_get_catalog_string(KMP_HW_CORE
, plural
));
1022 __kmp_str_buf_print(buf
, "%s eff=%d",
1023 __kmp_hw_get_catalog_string(KMP_HW_CORE
, plural
),
1024 attr
.get_core_eff());
1028 bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t
*mask
) {
1032 for (int i
= 0; i
< num_hw_threads
; ++i
) {
1033 int os_id
= hw_threads
[i
].os_id
;
1034 if (KMP_CPU_ISSET(os_id
, mask
)) {
1036 hw_threads
[new_index
] = hw_threads
[i
];
1039 KMP_CPU_CLR(os_id
, __kmp_affin_fullMask
);
1044 KMP_DEBUG_ASSERT(new_index
<= num_hw_threads
);
1045 affected
= (num_hw_threads
!= new_index
);
1046 num_hw_threads
= new_index
;
1048 // Post hardware subset canonicalization
1050 _gather_enumeration_information();
1051 _discover_uniformity();
1053 _set_last_level_cache();
1055 // Copy filtered full mask if topology has single processor group
1056 if (__kmp_num_proc_groups
<= 1)
1058 __kmp_affin_origMask
->copy(__kmp_affin_fullMask
);
1063 // Apply the KMP_HW_SUBSET envirable to the topology
1064 // Returns true if KMP_HW_SUBSET filtered any processors
1065 // otherwise, returns false
1066 bool kmp_topology_t::filter_hw_subset() {
1067 // If KMP_HW_SUBSET wasn't requested, then do nothing.
1068 if (!__kmp_hw_subset
)
1071 // First, sort the KMP_HW_SUBSET items by the machine topology
1072 __kmp_hw_subset
->sort();
1074 // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1075 bool using_core_types
= false;
1076 bool using_core_effs
= false;
1077 int hw_subset_depth
= __kmp_hw_subset
->get_depth();
1078 kmp_hw_t specified
[KMP_HW_LAST
];
1079 int *topology_levels
= (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth
);
1080 KMP_ASSERT(hw_subset_depth
> 0);
1081 KMP_FOREACH_HW_TYPE(i
) { specified
[i
] = KMP_HW_UNKNOWN
; }
1082 int core_level
= get_level(KMP_HW_CORE
);
1083 for (int i
= 0; i
< hw_subset_depth
; ++i
) {
1085 const kmp_hw_subset_t::item_t
&item
= __kmp_hw_subset
->at(i
);
1086 int num
= item
.num
[0];
1087 int offset
= item
.offset
[0];
1088 kmp_hw_t type
= item
.type
;
1089 kmp_hw_t equivalent_type
= equivalent
[type
];
1090 int level
= get_level(type
);
1091 topology_levels
[i
] = level
;
1093 // Check to see if current layer is in detected machine topology
1094 if (equivalent_type
!= KMP_HW_UNKNOWN
) {
1095 __kmp_hw_subset
->at(i
).type
= equivalent_type
;
1097 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetNotExistGeneric
,
1098 __kmp_hw_get_catalog_string(type
));
1102 // Check to see if current layer has already been
1103 // specified either directly or through an equivalent type
1104 if (specified
[equivalent_type
] != KMP_HW_UNKNOWN
) {
1105 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetEqvLayers
,
1106 __kmp_hw_get_catalog_string(type
),
1107 __kmp_hw_get_catalog_string(specified
[equivalent_type
]));
1110 specified
[equivalent_type
] = type
;
1112 // Check to see if each layer's num & offset parameters are valid
1113 max_count
= get_ratio(level
);
1114 if (max_count
< 0 ||
1115 (num
!= kmp_hw_subset_t::USE_ALL
&& num
+ offset
> max_count
)) {
1116 bool plural
= (num
> 1);
1117 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetManyGeneric
,
1118 __kmp_hw_get_catalog_string(type
, plural
));
1122 // Check to see if core attributes are consistent
1123 if (core_level
== level
) {
1124 // Determine which core attributes are specified
1125 for (int j
= 0; j
< item
.num_attrs
; ++j
) {
1126 if (item
.attr
[j
].is_core_type_valid())
1127 using_core_types
= true;
1128 if (item
.attr
[j
].is_core_eff_valid())
1129 using_core_effs
= true;
1132 // Check if using a single core attribute on non-hybrid arch.
1133 // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1135 // Check if using multiple core attributes on non-hyrbid arch.
1136 // Ignore all of KMP_HW_SUBSET if this is the case.
1137 if ((using_core_effs
|| using_core_types
) && !__kmp_is_hybrid_cpu()) {
1138 if (item
.num_attrs
== 1) {
1139 if (using_core_effs
) {
1140 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetIgnoringAttr
,
1143 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetIgnoringAttr
,
1146 using_core_effs
= false;
1147 using_core_types
= false;
1149 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetAttrsNonHybrid
);
1154 // Check if using both core types and core efficiencies together
1155 if (using_core_types
&& using_core_effs
) {
1156 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetIncompat
, "core_type",
1161 // Check that core efficiency values are valid
1162 if (using_core_effs
) {
1163 for (int j
= 0; j
< item
.num_attrs
; ++j
) {
1164 if (item
.attr
[j
].is_core_eff_valid()) {
1165 int core_eff
= item
.attr
[j
].get_core_eff();
1166 if (core_eff
< 0 || core_eff
>= num_core_efficiencies
) {
1168 __kmp_str_buf_init(&buf
);
1169 __kmp_str_buf_print(&buf
, "%d", item
.attr
[j
].get_core_eff());
1170 __kmp_msg(kmp_ms_warning
,
1171 KMP_MSG(AffHWSubsetAttrInvalid
, "efficiency", buf
.str
),
1172 KMP_HNT(ValidValuesRange
, 0, num_core_efficiencies
- 1),
1174 __kmp_str_buf_free(&buf
);
1181 // Check that the number of requested cores with attributes is valid
1182 if (using_core_types
|| using_core_effs
) {
1183 for (int j
= 0; j
< item
.num_attrs
; ++j
) {
1184 int num
= item
.num
[j
];
1185 int offset
= item
.offset
[j
];
1186 int level_above
= core_level
- 1;
1187 if (level_above
>= 0) {
1188 max_count
= get_ncores_with_attr_per(item
.attr
[j
], level_above
);
1189 if (max_count
<= 0 ||
1190 (num
!= kmp_hw_subset_t::USE_ALL
&& num
+ offset
> max_count
)) {
1192 __kmp_hw_get_catalog_core_string(item
.attr
[j
], &buf
, num
> 0);
1193 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetManyGeneric
, buf
.str
);
1194 __kmp_str_buf_free(&buf
);
1201 if ((using_core_types
|| using_core_effs
) && item
.num_attrs
> 1) {
1202 for (int j
= 0; j
< item
.num_attrs
; ++j
) {
1203 // Ambiguous use of specific core attribute + generic core
1204 // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1205 if (!item
.attr
[j
]) {
1206 kmp_hw_attr_t other_attr
;
1207 for (int k
= 0; k
< item
.num_attrs
; ++k
) {
1208 if (item
.attr
[k
] != item
.attr
[j
]) {
1209 other_attr
= item
.attr
[k
];
1214 __kmp_hw_get_catalog_core_string(other_attr
, &buf
, item
.num
[j
] > 0);
1215 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetIncompat
,
1216 __kmp_hw_get_catalog_string(KMP_HW_CORE
), buf
.str
);
1217 __kmp_str_buf_free(&buf
);
1220 // Allow specifying a specific core type or core eff exactly once
1221 for (int k
= 0; k
< j
; ++k
) {
1222 if (!item
.attr
[j
] || !item
.attr
[k
])
1224 if (item
.attr
[k
] == item
.attr
[j
]) {
1226 __kmp_hw_get_catalog_core_string(item
.attr
[j
], &buf
,
1228 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetAttrRepeat
, buf
.str
);
1229 __kmp_str_buf_free(&buf
);
1238 struct core_type_indexer
{
1239 int operator()(const kmp_hw_thread_t
&t
) const {
1240 switch (t
.attrs
.get_core_type()) {
1241 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1242 case KMP_HW_CORE_TYPE_ATOM
:
1244 case KMP_HW_CORE_TYPE_CORE
:
1247 case KMP_HW_CORE_TYPE_UNKNOWN
:
1254 struct core_eff_indexer
{
1255 int operator()(const kmp_hw_thread_t
&t
) const {
1256 return t
.attrs
.get_core_eff();
1260 kmp_sub_ids_t
<KMP_HW_MAX_NUM_CORE_TYPES
, core_type_indexer
> core_type_sub_ids(
1262 kmp_sub_ids_t
<KMP_HW_MAX_NUM_CORE_EFFS
, core_eff_indexer
> core_eff_sub_ids(
1265 // Determine which hardware threads should be filtered.
1266 int num_filtered
= 0;
1267 kmp_affin_mask_t
*filtered_mask
;
1268 KMP_CPU_ALLOC(filtered_mask
);
1269 KMP_CPU_COPY(filtered_mask
, __kmp_affin_fullMask
);
1270 for (int i
= 0; i
< num_hw_threads
; ++i
) {
1271 kmp_hw_thread_t
&hw_thread
= hw_threads
[i
];
1272 // Update type_sub_id
1273 if (using_core_types
)
1274 core_type_sub_ids
.update(hw_thread
);
1275 if (using_core_effs
)
1276 core_eff_sub_ids
.update(hw_thread
);
1278 // Check to see if this hardware thread should be filtered
1279 bool should_be_filtered
= false;
1280 for (int hw_subset_index
= 0; hw_subset_index
< hw_subset_depth
;
1281 ++hw_subset_index
) {
1282 const auto &hw_subset_item
= __kmp_hw_subset
->at(hw_subset_index
);
1283 int level
= topology_levels
[hw_subset_index
];
1286 if ((using_core_effs
|| using_core_types
) && level
== core_level
) {
1287 // Look for the core attribute in KMP_HW_SUBSET which corresponds
1288 // to this hardware thread's core attribute. Use this num,offset plus
1289 // the running sub_id for the particular core attribute of this hardware
1290 // thread to determine if the hardware thread should be filtered or not.
1292 kmp_hw_core_type_t core_type
= hw_thread
.attrs
.get_core_type();
1293 int core_eff
= hw_thread
.attrs
.get_core_eff();
1294 for (attr_idx
= 0; attr_idx
< hw_subset_item
.num_attrs
; ++attr_idx
) {
1295 if (using_core_types
&&
1296 hw_subset_item
.attr
[attr_idx
].get_core_type() == core_type
)
1298 if (using_core_effs
&&
1299 hw_subset_item
.attr
[attr_idx
].get_core_eff() == core_eff
)
1302 // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1303 if (attr_idx
== hw_subset_item
.num_attrs
) {
1304 should_be_filtered
= true;
1308 int num
= hw_subset_item
.num
[attr_idx
];
1309 int offset
= hw_subset_item
.offset
[attr_idx
];
1310 if (using_core_types
)
1311 sub_id
= core_type_sub_ids
.get_sub_id(hw_thread
);
1313 sub_id
= core_eff_sub_ids
.get_sub_id(hw_thread
);
1314 if (sub_id
< offset
||
1315 (num
!= kmp_hw_subset_t::USE_ALL
&& sub_id
>= offset
+ num
)) {
1316 should_be_filtered
= true;
1320 int num
= hw_subset_item
.num
[0];
1321 int offset
= hw_subset_item
.offset
[0];
1322 if (hw_thread
.sub_ids
[level
] < offset
||
1323 (num
!= kmp_hw_subset_t::USE_ALL
&&
1324 hw_thread
.sub_ids
[level
] >= offset
+ num
)) {
1325 should_be_filtered
= true;
1330 // Collect filtering information
1331 if (should_be_filtered
) {
1332 KMP_CPU_CLR(hw_thread
.os_id
, filtered_mask
);
1337 // One last check that we shouldn't allow filtering entire machine
1338 if (num_filtered
== num_hw_threads
) {
1339 KMP_AFF_WARNING(__kmp_affinity
, AffHWSubsetAllFiltered
);
1344 restrict_to_mask(filtered_mask
);
1348 bool kmp_topology_t::is_close(int hwt1
, int hwt2
,
1349 const kmp_affinity_t
&stgs
) const {
1350 int hw_level
= stgs
.gran_levels
;
1351 if (hw_level
>= depth
)
1354 const kmp_hw_thread_t
&t1
= hw_threads
[hwt1
];
1355 const kmp_hw_thread_t
&t2
= hw_threads
[hwt2
];
1356 if (stgs
.flags
.core_types_gran
)
1357 return t1
.attrs
.get_core_type() == t2
.attrs
.get_core_type();
1358 if (stgs
.flags
.core_effs_gran
)
1359 return t1
.attrs
.get_core_eff() == t2
.attrs
.get_core_eff();
1360 for (int i
= 0; i
< (depth
- hw_level
); ++i
) {
1361 if (t1
.ids
[i
] != t2
.ids
[i
])
1367 ////////////////////////////////////////////////////////////////////////////////
1369 bool KMPAffinity::picked_api
= false;
1371 void *KMPAffinity::Mask::operator new(size_t n
) { return __kmp_allocate(n
); }
1372 void *KMPAffinity::Mask::operator new[](size_t n
) { return __kmp_allocate(n
); }
1373 void KMPAffinity::Mask::operator delete(void *p
) { __kmp_free(p
); }
1374 void KMPAffinity::Mask::operator delete[](void *p
) { __kmp_free(p
); }
1375 void *KMPAffinity::operator new(size_t n
) { return __kmp_allocate(n
); }
1376 void KMPAffinity::operator delete(void *p
) { __kmp_free(p
); }
1378 void KMPAffinity::pick_api() {
1379 KMPAffinity
*affinity_dispatch
;
1383 // Only use Hwloc if affinity isn't explicitly disabled and
1384 // user requests Hwloc topology method
1385 if (__kmp_affinity_top_method
== affinity_top_method_hwloc
&&
1386 __kmp_affinity
.type
!= affinity_disabled
) {
1387 affinity_dispatch
= new KMPHwlocAffinity();
1391 affinity_dispatch
= new KMPNativeAffinity();
1393 __kmp_affinity_dispatch
= affinity_dispatch
;
1397 void KMPAffinity::destroy_api() {
1398 if (__kmp_affinity_dispatch
!= NULL
) {
1399 delete __kmp_affinity_dispatch
;
1400 __kmp_affinity_dispatch
= NULL
;
1405 #define KMP_ADVANCE_SCAN(scan) \
1406 while (*scan != '\0') { \
1410 // Print the affinity mask to the character array in a pretty format.
1411 // The format is a comma separated list of non-negative integers or integer
1412 // ranges: e.g., 1,2,3-5,7,9-15
1413 // The format can also be the string "{<empty>}" if no bits are set in mask
1414 char *__kmp_affinity_print_mask(char *buf
, int buf_len
,
1415 kmp_affin_mask_t
*mask
) {
1416 int start
= 0, finish
= 0, previous
= 0;
1419 KMP_ASSERT(buf_len
>= 40);
1422 char *end
= buf
+ buf_len
- 1;
1424 // Check for empty set.
1425 if (mask
->begin() == mask
->end()) {
1426 KMP_SNPRINTF(scan
, end
- scan
+ 1, "{<empty>}");
1427 KMP_ADVANCE_SCAN(scan
);
1428 KMP_ASSERT(scan
<= end
);
1433 start
= mask
->begin();
1436 // [start, previous] is inclusive range of contiguous bits in mask
1437 for (finish
= mask
->next(start
), previous
= start
;
1438 finish
== previous
+ 1 && finish
!= mask
->end();
1439 finish
= mask
->next(finish
)) {
1443 // The first range does not need a comma printed before it, but the rest
1444 // of the ranges do need a comma beforehand
1446 KMP_SNPRINTF(scan
, end
- scan
+ 1, "%s", ",");
1447 KMP_ADVANCE_SCAN(scan
);
1449 first_range
= false;
1451 // Range with three or more contiguous bits in the affinity mask
1452 if (previous
- start
> 1) {
1453 KMP_SNPRINTF(scan
, end
- scan
+ 1, "%u-%u", start
, previous
);
1455 // Range with one or two contiguous bits in the affinity mask
1456 KMP_SNPRINTF(scan
, end
- scan
+ 1, "%u", start
);
1457 KMP_ADVANCE_SCAN(scan
);
1458 if (previous
- start
> 0) {
1459 KMP_SNPRINTF(scan
, end
- scan
+ 1, ",%u", previous
);
1462 KMP_ADVANCE_SCAN(scan
);
1463 // Start over with new start point
1465 if (start
== mask
->end())
1467 // Check for overflow
1472 // Check for overflow
1473 KMP_ASSERT(scan
<= end
);
1476 #undef KMP_ADVANCE_SCAN
1478 // Print the affinity mask to the string buffer object in a pretty format
1479 // The format is a comma separated list of non-negative integers or integer
1480 // ranges: e.g., 1,2,3-5,7,9-15
1481 // The format can also be the string "{<empty>}" if no bits are set in mask
1482 kmp_str_buf_t
*__kmp_affinity_str_buf_mask(kmp_str_buf_t
*buf
,
1483 kmp_affin_mask_t
*mask
) {
1484 int start
= 0, finish
= 0, previous
= 0;
1489 __kmp_str_buf_clear(buf
);
1491 // Check for empty set.
1492 if (mask
->begin() == mask
->end()) {
1493 __kmp_str_buf_print(buf
, "%s", "{<empty>}");
1498 start
= mask
->begin();
1501 // [start, previous] is inclusive range of contiguous bits in mask
1502 for (finish
= mask
->next(start
), previous
= start
;
1503 finish
== previous
+ 1 && finish
!= mask
->end();
1504 finish
= mask
->next(finish
)) {
1508 // The first range does not need a comma printed before it, but the rest
1509 // of the ranges do need a comma beforehand
1511 __kmp_str_buf_print(buf
, "%s", ",");
1513 first_range
= false;
1515 // Range with three or more contiguous bits in the affinity mask
1516 if (previous
- start
> 1) {
1517 __kmp_str_buf_print(buf
, "%u-%u", start
, previous
);
1519 // Range with one or two contiguous bits in the affinity mask
1520 __kmp_str_buf_print(buf
, "%u", start
);
1521 if (previous
- start
> 0) {
1522 __kmp_str_buf_print(buf
, ",%u", previous
);
1525 // Start over with new start point
1527 if (start
== mask
->end())
1533 // Return (possibly empty) affinity mask representing the offline CPUs
1534 // Caller must free the mask
1535 kmp_affin_mask_t
*__kmp_affinity_get_offline_cpus() {
1536 kmp_affin_mask_t
*offline
;
1537 KMP_CPU_ALLOC(offline
);
1538 KMP_CPU_ZERO(offline
);
1540 int n
, begin_cpu
, end_cpu
;
1541 kmp_safe_raii_file_t offline_file
;
1542 auto skip_ws
= [](FILE *f
) {
1546 } while (isspace(c
));
1550 // File contains CSV of integer ranges representing the offline CPUs
1551 // e.g., 1,2,4-7,9,11-15
1552 int status
= offline_file
.try_open("/sys/devices/system/cpu/offline", "r");
1555 while (!feof(offline_file
)) {
1556 skip_ws(offline_file
);
1557 n
= fscanf(offline_file
, "%d", &begin_cpu
);
1560 skip_ws(offline_file
);
1561 int c
= fgetc(offline_file
);
1562 if (c
== EOF
|| c
== ',') {
1564 end_cpu
= begin_cpu
;
1565 } else if (c
== '-') {
1567 skip_ws(offline_file
);
1568 n
= fscanf(offline_file
, "%d", &end_cpu
);
1571 skip_ws(offline_file
);
1572 c
= fgetc(offline_file
); // skip ','
1577 // Ensure a valid range of CPUs
1578 if (begin_cpu
< 0 || begin_cpu
>= __kmp_xproc
|| end_cpu
< 0 ||
1579 end_cpu
>= __kmp_xproc
|| begin_cpu
> end_cpu
) {
1582 // Insert [begin_cpu, end_cpu] into offline mask
1583 for (int cpu
= begin_cpu
; cpu
<= end_cpu
; ++cpu
) {
1584 KMP_CPU_SET(cpu
, offline
);
1591 // Return the number of available procs
1592 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t
*mask
) {
1596 #if KMP_GROUP_AFFINITY
1598 if (__kmp_num_proc_groups
> 1) {
1600 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount
!= NULL
);
1601 for (group
= 0; group
< __kmp_num_proc_groups
; group
++) {
1603 int num
= __kmp_GetActiveProcessorCount(group
);
1604 for (i
= 0; i
< num
; i
++) {
1605 KMP_CPU_SET(i
+ group
* (CHAR_BIT
* sizeof(DWORD_PTR
)), mask
);
1611 #endif /* KMP_GROUP_AFFINITY */
1615 kmp_affin_mask_t
*offline_cpus
= __kmp_affinity_get_offline_cpus();
1616 for (proc
= 0; proc
< __kmp_xproc
; proc
++) {
1617 // Skip offline CPUs
1618 if (KMP_CPU_ISSET(proc
, offline_cpus
))
1620 KMP_CPU_SET(proc
, mask
);
1623 KMP_CPU_FREE(offline_cpus
);
1629 // All of the __kmp_affinity_create_*_map() routines should allocate the
1630 // internal topology object and set the layer ids for it. Each routine
1631 // returns a boolean on whether it was successful at doing so.
1632 kmp_affin_mask_t
*__kmp_affin_fullMask
= NULL
;
1633 // Original mask is a subset of full mask in multiple processor groups topology
1634 kmp_affin_mask_t
*__kmp_affin_origMask
= NULL
;
1637 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj
) {
1638 #if HWLOC_API_VERSION >= 0x00020000
1639 return hwloc_obj_type_is_cache(obj
->type
);
1641 return obj
->type
== HWLOC_OBJ_CACHE
;
1645 // Returns KMP_HW_* type derived from HWLOC_* type
1646 static inline kmp_hw_t
__kmp_hwloc_type_2_topology_type(hwloc_obj_t obj
) {
1648 if (__kmp_hwloc_is_cache_type(obj
)) {
1649 if (obj
->attr
->cache
.type
== HWLOC_OBJ_CACHE_INSTRUCTION
)
1650 return KMP_HW_UNKNOWN
;
1651 switch (obj
->attr
->cache
.depth
) {
1655 #if KMP_MIC_SUPPORTED
1656 if (__kmp_mic_type
== mic3
) {
1664 return KMP_HW_UNKNOWN
;
1667 switch (obj
->type
) {
1668 case HWLOC_OBJ_PACKAGE
:
1669 return KMP_HW_SOCKET
;
1670 case HWLOC_OBJ_NUMANODE
:
1672 case HWLOC_OBJ_CORE
:
1675 return KMP_HW_THREAD
;
1676 case HWLOC_OBJ_GROUP
:
1677 #if HWLOC_API_VERSION >= 0x00020000
1678 if (obj
->attr
->group
.kind
== HWLOC_GROUP_KIND_INTEL_DIE
)
1680 else if (obj
->attr
->group
.kind
== HWLOC_GROUP_KIND_INTEL_TILE
)
1682 else if (obj
->attr
->group
.kind
== HWLOC_GROUP_KIND_INTEL_MODULE
)
1683 return KMP_HW_MODULE
;
1684 else if (obj
->attr
->group
.kind
== HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP
)
1685 return KMP_HW_PROC_GROUP
;
1687 return KMP_HW_UNKNOWN
;
1688 #if HWLOC_API_VERSION >= 0x00020100
1693 return KMP_HW_UNKNOWN
;
1696 // Returns the number of objects of type 'type' below 'obj' within the topology
1697 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1698 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1700 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj
,
1701 hwloc_obj_type_t type
) {
1704 for (first
= hwloc_get_obj_below_by_type(__kmp_hwloc_topology
, obj
->type
,
1705 obj
->logical_index
, type
, 0);
1706 first
!= NULL
&& hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology
,
1707 obj
->type
, first
) == obj
;
1708 first
= hwloc_get_next_obj_by_type(__kmp_hwloc_topology
, first
->type
,
1715 // This gets the sub_id for a lower object under a higher object in the
1717 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t
, hwloc_obj_t higher
,
1718 hwloc_obj_t lower
) {
1720 hwloc_obj_type_t ltype
= lower
->type
;
1721 int lindex
= lower
->logical_index
- 1;
1723 // Get the previous lower object
1724 obj
= hwloc_get_obj_by_type(t
, ltype
, lindex
);
1725 while (obj
&& lindex
>= 0 &&
1726 hwloc_bitmap_isincluded(obj
->cpuset
, higher
->cpuset
)) {
1727 if (obj
->userdata
) {
1728 sub_id
= (int)(RCAST(kmp_intptr_t
, obj
->userdata
));
1733 obj
= hwloc_get_obj_by_type(t
, ltype
, lindex
);
1735 // store sub_id + 1 so that 0 is differed from NULL
1736 lower
->userdata
= RCAST(void *, sub_id
+ 1);
1740 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t
*const msg_id
) {
1742 int hw_thread_index
, sub_id
;
1744 hwloc_obj_t pu
, obj
, root
, prev
;
1745 kmp_hw_t types
[KMP_HW_LAST
];
1746 hwloc_obj_type_t hwloc_types
[KMP_HW_LAST
];
1748 hwloc_topology_t tp
= __kmp_hwloc_topology
;
1749 *msg_id
= kmp_i18n_null
;
1750 if (__kmp_affinity
.flags
.verbose
) {
1751 KMP_INFORM(AffUsingHwloc
, "KMP_AFFINITY");
1754 if (!KMP_AFFINITY_CAPABLE()) {
1755 // Hack to try and infer the machine topology using only the data
1756 // available from hwloc on the current thread, and __kmp_xproc.
1757 KMP_ASSERT(__kmp_affinity
.type
== affinity_none
);
1758 // hwloc only guarantees existance of PU object, so check PACKAGE and CORE
1759 hwloc_obj_t o
= hwloc_get_obj_by_type(tp
, HWLOC_OBJ_PACKAGE
, 0);
1761 nCoresPerPkg
= __kmp_hwloc_get_nobjs_under_obj(o
, HWLOC_OBJ_CORE
);
1763 nCoresPerPkg
= 1; // no PACKAGE found
1764 o
= hwloc_get_obj_by_type(tp
, HWLOC_OBJ_CORE
, 0);
1766 __kmp_nThreadsPerCore
= __kmp_hwloc_get_nobjs_under_obj(o
, HWLOC_OBJ_PU
);
1768 __kmp_nThreadsPerCore
= 1; // no CORE found
1769 __kmp_ncores
= __kmp_xproc
/ __kmp_nThreadsPerCore
;
1770 if (nCoresPerPkg
== 0)
1771 nCoresPerPkg
= 1; // to prevent possible division by 0
1772 nPackages
= (__kmp_xproc
+ nCoresPerPkg
- 1) / nCoresPerPkg
;
1776 #if HWLOC_API_VERSION >= 0x00020400
1777 // Handle multiple types of cores if they exist on the system
1778 int nr_cpu_kinds
= hwloc_cpukinds_get_nr(tp
, 0);
1780 typedef struct kmp_hwloc_cpukinds_info_t
{
1782 kmp_hw_core_type_t core_type
;
1783 hwloc_bitmap_t mask
;
1784 } kmp_hwloc_cpukinds_info_t
;
1785 kmp_hwloc_cpukinds_info_t
*cpukinds
= nullptr;
1787 if (nr_cpu_kinds
> 0) {
1789 struct hwloc_info_s
*infos
;
1790 cpukinds
= (kmp_hwloc_cpukinds_info_t
*)__kmp_allocate(
1791 sizeof(kmp_hwloc_cpukinds_info_t
) * nr_cpu_kinds
);
1792 for (unsigned idx
= 0; idx
< (unsigned)nr_cpu_kinds
; ++idx
) {
1793 cpukinds
[idx
].efficiency
= -1;
1794 cpukinds
[idx
].core_type
= KMP_HW_CORE_TYPE_UNKNOWN
;
1795 cpukinds
[idx
].mask
= hwloc_bitmap_alloc();
1796 if (hwloc_cpukinds_get_info(tp
, idx
, cpukinds
[idx
].mask
,
1797 &cpukinds
[idx
].efficiency
, &nr_infos
, &infos
,
1799 for (unsigned i
= 0; i
< nr_infos
; ++i
) {
1800 if (__kmp_str_match("CoreType", 8, infos
[i
].name
)) {
1801 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1802 if (__kmp_str_match("IntelAtom", 9, infos
[i
].value
)) {
1803 cpukinds
[idx
].core_type
= KMP_HW_CORE_TYPE_ATOM
;
1805 } else if (__kmp_str_match("IntelCore", 9, infos
[i
].value
)) {
1806 cpukinds
[idx
].core_type
= KMP_HW_CORE_TYPE_CORE
;
1817 root
= hwloc_get_root_obj(tp
);
1819 // Figure out the depth and types in the topology
1821 pu
= hwloc_get_pu_obj_by_os_index(tp
, __kmp_affin_fullMask
->begin());
1824 types
[depth
] = KMP_HW_THREAD
;
1825 hwloc_types
[depth
] = obj
->type
;
1827 while (obj
!= root
&& obj
!= NULL
) {
1829 #if HWLOC_API_VERSION >= 0x00020000
1830 if (obj
->memory_arity
) {
1832 for (memory
= obj
->memory_first_child
; memory
;
1833 memory
= hwloc_get_next_child(tp
, obj
, memory
)) {
1834 if (memory
->type
== HWLOC_OBJ_NUMANODE
)
1837 if (memory
&& memory
->type
== HWLOC_OBJ_NUMANODE
) {
1838 types
[depth
] = KMP_HW_NUMA
;
1839 hwloc_types
[depth
] = memory
->type
;
1844 type
= __kmp_hwloc_type_2_topology_type(obj
);
1845 if (type
!= KMP_HW_UNKNOWN
) {
1846 types
[depth
] = type
;
1847 hwloc_types
[depth
] = obj
->type
;
1851 KMP_ASSERT(depth
> 0);
1853 // Get the order for the types correct
1854 for (int i
= 0, j
= depth
- 1; i
< j
; ++i
, --j
) {
1855 hwloc_obj_type_t hwloc_temp
= hwloc_types
[i
];
1856 kmp_hw_t temp
= types
[i
];
1857 types
[i
] = types
[j
];
1859 hwloc_types
[i
] = hwloc_types
[j
];
1860 hwloc_types
[j
] = hwloc_temp
;
1863 // Allocate the data structure to be returned.
1864 __kmp_topology
= kmp_topology_t::allocate(__kmp_avail_proc
, depth
, types
);
1866 hw_thread_index
= 0;
1868 while ((pu
= hwloc_get_next_obj_by_type(tp
, HWLOC_OBJ_PU
, pu
))) {
1869 int index
= depth
- 1;
1870 bool included
= KMP_CPU_ISSET(pu
->os_index
, __kmp_affin_fullMask
);
1871 kmp_hw_thread_t
&hw_thread
= __kmp_topology
->at(hw_thread_index
);
1874 hw_thread
.ids
[index
] = pu
->logical_index
;
1875 hw_thread
.os_id
= pu
->os_index
;
1876 // If multiple core types, then set that attribute for the hardware thread
1877 #if HWLOC_API_VERSION >= 0x00020400
1879 int cpukind_index
= -1;
1880 for (int i
= 0; i
< nr_cpu_kinds
; ++i
) {
1881 if (hwloc_bitmap_isset(cpukinds
[i
].mask
, hw_thread
.os_id
)) {
1886 if (cpukind_index
>= 0) {
1887 hw_thread
.attrs
.set_core_type(cpukinds
[cpukind_index
].core_type
);
1888 hw_thread
.attrs
.set_core_eff(cpukinds
[cpukind_index
].efficiency
);
1896 while (obj
!= root
&& obj
!= NULL
) {
1898 #if HWLOC_API_VERSION >= 0x00020000
1899 // NUMA Nodes are handled differently since they are not within the
1900 // parent/child structure anymore. They are separate children
1901 // of obj (memory_first_child points to first memory child)
1902 if (obj
->memory_arity
) {
1904 for (memory
= obj
->memory_first_child
; memory
;
1905 memory
= hwloc_get_next_child(tp
, obj
, memory
)) {
1906 if (memory
->type
== HWLOC_OBJ_NUMANODE
)
1909 if (memory
&& memory
->type
== HWLOC_OBJ_NUMANODE
) {
1910 sub_id
= __kmp_hwloc_get_sub_id(tp
, memory
, prev
);
1912 hw_thread
.ids
[index
] = memory
->logical_index
;
1913 hw_thread
.ids
[index
+ 1] = sub_id
;
1921 type
= __kmp_hwloc_type_2_topology_type(obj
);
1922 if (type
!= KMP_HW_UNKNOWN
) {
1923 sub_id
= __kmp_hwloc_get_sub_id(tp
, obj
, prev
);
1925 hw_thread
.ids
[index
] = obj
->logical_index
;
1926 hw_thread
.ids
[index
+ 1] = sub_id
;
1936 #if HWLOC_API_VERSION >= 0x00020400
1937 // Free the core types information
1939 for (int idx
= 0; idx
< nr_cpu_kinds
; ++idx
)
1940 hwloc_bitmap_free(cpukinds
[idx
].mask
);
1941 __kmp_free(cpukinds
);
1944 __kmp_topology
->sort_ids();
1947 #endif // KMP_USE_HWLOC
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");

  // Even if __kmp_affinity.type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity.type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity.type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
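    // Example: on 64-bit Windows DWORD_PTR is 8 bytes, so BITS_PER_GROUP is 64
    // and OS proc 70 gets ids {1, 6, 6}: processor group 1, proc 6 within it.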
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
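  // Example: __kmp_extract_bits<8, 15>(0x12345678) shifts left by 16 to get
  // 0x56780000, then right by 24, yielding 0x56 (bits 8..15 of the input).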
static int __kmp_cpuid_mask_width(int count) {
  while ((1 << r) < count)
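  // Example: a count of 6 logical processors needs a mask width of 3, since
  // 1 << 3 = 8 is the smallest power of two >= 6.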
class apicThreadInfo {
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
  if (aa->pkgId > bb->pkgId)
  if (aa->coreId < bb->coreId)
  if (aa->coreId > bb->coreId)
  if (aa->threadId < bb->threadId)
  if (aa->threadId > bb->threadId)
class kmp_cache_info_t {
    unsigned level, mask;
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    return KMP_HW_UNKNOWN;

  static const int MAX_CACHE_LEVEL = 3;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      // Skip instruction caches
      if (cache_type == 2) {
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
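      // Example: if 8 logical processors share a cache, max_threads_sharing
      // is 8, cache_mask_width is 3, and the mask is ...11111000; APIC ids
      // that are equal under this mask share that cache.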
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
    *msg_id = kmp_i18n_str_NoLeaf4Support;
  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity.type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    __kmp_x86_cpuid(0, 0, &buf);
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    // - Older OSes are usually found on machines with older chips, which do not
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity.type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    __kmp_x86_cpuid(0, 0, &buf);
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
      threadInfo[nApics].maxCoresPerPkg = 1;

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
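    // Example: maxThreadsPerPkg = 4 and maxCoresPerPkg = 2 give widthCT = 2,
    // widthC = 1 and widthT = 1, so an apicId of 0b1011 decodes to pkgId = 2,
    // coreId = 1, threadId = 1.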
  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  previous_affinity.restore();

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;

    if (threadInfo[i].coreId != lastCoreId) {
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      lastThreadId = threadInfo[i].threadId;
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  __kmp_ncores = nCores;
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int threadLevel = 2;
  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
    types[idx++] = KMP_HW_SOCKET;
    types[idx++] = KMP_HW_CORE;
  if (threadLevel >= 0)
    types[idx++] = KMP_HW_THREAD;

  KMP_ASSERT(depth > 0);
  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);

  for (i = 0; i < nApics; ++i) {
    unsigned os = threadInfo[i].osId;
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
    if (pkgLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].pkgId;
    if (coreLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].coreId;
    if (threadLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].threadId;
    hw_thread.os_id = os;

  __kmp_free(threadInfo);
  __kmp_topology->sort_ids();
  if (!__kmp_topology->check_ids()) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
    *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
// Hybrid cpu detection using CPUID.1A
// Thread should be pinned to processor already
static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
                                  unsigned *native_model_id) {
  __kmp_x86_cpuid(0x1a, 0, &buf);
  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
  case KMP_HW_CORE_TYPE_ATOM:
  case KMP_HW_CORE_TYPE_CORE:
  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
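  // Note: per the CPUID.1A encoding, EAX bits 31:24 carry the core type
  // (matching the KMP_HW_CORE_TYPE_ATOM / KMP_HW_CORE_TYPE_CORE enumerator
  // values) and bits 23:0 carry the native model id.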
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on CPUID.B or CPUID.1F
 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
    ---+-----------+--------------+-------------+-----------------+
    EAX| reserved  |   reserved   |  reserved   |  Bits to Shift  |
    ---+-----------|--------------+-------------+-----------------|
    EBX| reserved  |  Num logical processors at level (16 bits)   |
    ---+-----------|--------------+-------------------------------|
    ECX| reserved  |  Level Type  |     Level Number (8 bits)     |
    ---+-----------+--------------+-------------------------------|
    EDX|                    X2APIC ID (32 bits)                   |
    ---+----------------------------------------------------------+
  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
  INTEL_LEVEL_TYPE_SMT = 1,
  INTEL_LEVEL_TYPE_CORE = 2,
  INTEL_LEVEL_TYPE_MODULE = 3,
  INTEL_LEVEL_TYPE_TILE = 4,
  INTEL_LEVEL_TYPE_DIE = 5,
  INTEL_LEVEL_TYPE_LAST = 6,
struct cpuid_level_info_t {
  unsigned level_type, mask, mask_width, nitems, cache_mask;

static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
  switch (intel_type) {
  case INTEL_LEVEL_TYPE_INVALID:
    return KMP_HW_SOCKET;
  case INTEL_LEVEL_TYPE_SMT:
    return KMP_HW_THREAD;
  case INTEL_LEVEL_TYPE_CORE:
  case INTEL_LEVEL_TYPE_TILE:
  case INTEL_LEVEL_TYPE_MODULE:
    return KMP_HW_MODULE;
  case INTEL_LEVEL_TYPE_DIE:
  return KMP_HW_UNKNOWN;
// This function takes the topology leaf, a levels array to store the levels
// detected and a bitmap of the known levels.
// Returns the number of levels in the topology
__kmp_x2apicid_get_levels(int leaf,
                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
                          kmp_uint64 known_levels) {
  unsigned level, levels_index;
  unsigned level_type, mask_width, nitems;

  // New algorithm has known topology layers act as highest unknown topology
  // layers when unknown topology layers exist.
  // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>, <Y>,
  // <Z> are unknown topology layers. Then SMT will take the characteristics of
  // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
  // This eliminates unknown portions of the topology while still keeping the
  // correct structure.
  level = levels_index = 0;
    __kmp_x86_cpuid(leaf, level, &buf);
    level_type = __kmp_extract_bits<8, 15>(buf.ecx);
    mask_width = __kmp_extract_bits<0, 4>(buf.eax);
    nitems = __kmp_extract_bits<0, 15>(buf.ebx);
    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
    if (known_levels & (1ull << level_type)) {
      // Add a new level to the topology
      KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
      levels[levels_index].level_type = level_type;
      levels[levels_index].mask_width = mask_width;
      levels[levels_index].nitems = nitems;
      // If it is an unknown level, then logically move the previous layer up
      if (levels_index > 0) {
        levels[levels_index - 1].mask_width = mask_width;
        levels[levels_index - 1].nitems = nitems;
  } while (level_type != INTEL_LEVEL_TYPE_INVALID);

  // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
  if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)

  // Set the masks to & with apicid
  for (unsigned i = 0; i < levels_index; ++i) {
    if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
      levels[i].mask = ~((-1) << levels[i].mask_width);
      levels[i].cache_mask = (-1) << levels[i].mask_width;
      for (unsigned j = 0; j < i; ++j)
        levels[i].mask ^= levels[j].mask;
      KMP_DEBUG_ASSERT(i > 0);
      levels[i].mask = (-1) << levels[i - 1].mask_width;
      levels[i].cache_mask = 0;
  return levels_index;
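  // Example: with an SMT mask_width of 1 and a CORE mask_width of 5, the SMT
  // mask is 0x1, the CORE mask becomes 0x1F ^ 0x1 = 0x1E, and the package
  // (INVALID) level gets (-1) << 5, so an x2APIC id splits into disjoint
  // thread / core / package bit fields.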
static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
  unsigned levels_index;
  kmp_uint64 known_levels;
  int topology_leaf, highest_leaf, apic_id;
  static int leaves[] = {0, 0};
  kmp_i18n_id_t leaf_message_id;

  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);

  *msg_id = kmp_i18n_null;
  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));

  // Figure out the known topology levels
  known_levels = 0ull;
  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
    if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
      known_levels |= (1ull << i);

  // Get the highest cpuid leaf supported
  __kmp_x86_cpuid(0, 0, &buf);
  highest_leaf = buf.eax;

  // If a specific topology method was requested, only allow that specific leaf
  // otherwise, try both leaves 31 and 11 in that order
  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    leaf_message_id = kmp_i18n_str_NoLeaf11Support;
  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    leaf_message_id = kmp_i18n_str_NoLeaf31Support;
    leaf_message_id = kmp_i18n_str_NoLeaf11Support;

  // Check to see if cpuid leaf 31 or 11 is supported.
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
  for (int i = 0; i < num_leaves; ++i) {
    int leaf = leaves[i];
    if (highest_leaf < leaf)
    __kmp_x86_cpuid(leaf, 0, &buf);
    topology_leaf = leaf;
    levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
    if (levels_index == 0)
  if (topology_leaf == -1 || levels_index == 0) {
    *msg_id = leaf_message_id;
  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity.type == affinity_none);
    for (unsigned i = 0; i < levels_index; ++i) {
      if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
        __kmp_nThreadsPerCore = levels[i].nitems;
      } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
        nCoresPerPkg = levels[i].nitems;
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;

  // Allocate the data structure to be returned.
  int depth = levels_index;
  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
    types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
      kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);

  // Insert equivalent cache types if they exist
  kmp_cache_info_t cache_info;
  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
    const kmp_cache_info_t::info_t &info = cache_info[i];
    unsigned cache_mask = info.mask;
    unsigned cache_level = info.level;
    for (unsigned j = 0; j < levels_index; ++j) {
      unsigned hw_cache_mask = levels[j].cache_mask;
      kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
      if (hw_cache_mask == cache_mask && j < levels_index - 1) {
            __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
        __kmp_topology->set_equivalent_type(cache_type, type);
  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity.type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  int hw_thread_index = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
    unsigned my_levels_index;

    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    __kmp_x86_cpuid(topology_leaf, 0, &buf);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
        __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
    if (my_levels_index == 0 || my_levels_index != levels_index) {
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
    hw_thread.os_id = proc;
    // Put in topology information
    for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
      hw_thread.ids[idx] = apic_id & my_levels[j].mask;
        hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;

    // Hybrid information
    if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
      kmp_hw_core_type_t type;
      unsigned native_model_id;
      __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
      hw_thread.attrs.set_core_type(type);
      hw_thread.attrs.set_core_eff(efficiency);
  KMP_ASSERT(hw_thread_index > 0);
  __kmp_topology->sort_ids();
  if (!__kmp_topology->check_ids()) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
    *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
#define nodeIdIndex 4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;

static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
  const unsigned *aa = *(unsigned *const *)a;
  const unsigned *bb = *(unsigned *const *)b;
  for (i = maxIndex;; i--) {
#if KMP_USE_HIER_SCHED
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to number of cores
  // Set the maximum number of L2's to either number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
  // Or the number of cores for Intel(R) Xeon(R) processors
  // Set the maximum number of NUMA nodes and L3's to number of packages
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
  if (__kmp_mic_type >= mic3)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
  if (__kmp_mic_type >= mic3)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
// i.e., this thread's L1 or this thread's L2, etc.
int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
  int index = type + 1;
  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
  if (type == kmp_hier_layer_e::LAYER_THREAD)
  else if (type == kmp_hier_layer_e::LAYER_LOOP)
  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
  if (tid >= num_hw_threads)
    tid = tid % num_hw_threads;
  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
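// Example: with 2 hardware threads per core, tid 5 asking for its LAYER_L1
// index yields (5 / __kmp_nThreadsPerCore) % __kmp_ncores = 2, i.e. the L1
// belonging to the third core.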
// Return the number of t1's per t2
int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
  KMP_DEBUG_ASSERT(i1 <= i2);
  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
  // (nthreads/t2) / (nthreads/t1) = t1 / t2
  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
#endif // KMP_USE_HIER_SCHED
static inline const char *__kmp_cpuinfo_get_filename() {
  const char *filename;
  if (__kmp_cpuinfo_file != nullptr)
    filename = __kmp_cpuinfo_file;
    filename = "/proc/cpuinfo";

static inline const char *__kmp_cpuinfo_get_envvar() {
  const char *envvar = nullptr;
  if (__kmp_cpuinfo_file != nullptr)
    envvar = "KMP_CPUINFO_FILE";
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
static bool __kmp_affinity_create_cpuinfo_map(int *line,
                                              kmp_i18n_id_t *const msg_id) {
  const char *filename = __kmp_cpuinfo_get_filename();
  const char *envvar = __kmp_cpuinfo_get_envvar();
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);

  kmp_safe_raii_file_t f(filename, "r", envvar);

  // Scan of the file, and count the number of "processor" (osId) fields,
  // and find the highest value of <n> for a node_<n> field.
  unsigned num_records = 0;
    buf[sizeof(buf) - 1] = 1;
    if (!fgets(buf, sizeof(buf), f)) {
      // Read errors presumably because of EOF
    char s1[] = "processor";
    if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {

    // FIXME - this will match "node_<n> <garbage>"
    if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
      // validate the input first:
      if (level > (unsigned)__kmp_xproc) { // level is too big
        level = __kmp_xproc;
      if (nodeIdIndex + level >= maxIndex) {
        maxIndex = nodeIdIndex + level;

  // Check for empty file / no valid processor records, or too many. The number
  // of records can't exceed the number of valid bits in the affinity mask.
  if (num_records == 0) {
    *msg_id = kmp_i18n_str_NoProcRecords;
  if (num_records > (unsigned)__kmp_xproc) {
    *msg_id = kmp_i18n_str_TooManyProcRecords;

  // Set the file pointer back to the beginning, so that we can scan the file
  // again, this time performing a full parse of the data. Allocate a vector of
  // ProcCpuInfo object, where we will place the data. Adding an extra element
  // at the end allows us to remove a lot of extra checks for termination
  if (fseek(f, 0, SEEK_SET) != 0) {
    *msg_id = kmp_i18n_str_CantRewindCpuinfo;

  // Allocate the array of records to store the proc info in. The dummy
  // element at the end makes the logic in filling them out easier to code.
  unsigned **threadInfo =
      (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
  for (i = 0; i <= num_records; i++) {
        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));

#define CLEANUP_THREAD_INFO                                                    \
  for (i = 0; i <= num_records; i++) {                                         \
    __kmp_free(threadInfo[i]);                                                 \
  __kmp_free(threadInfo);

  // A value of UINT_MAX means that we didn't find the field
#define INIT_PROC_INFO(p)                                                      \
  for (__index = 0; __index <= maxIndex; __index++) {                          \
    (p)[__index] = UINT_MAX;                                                   \
  for (i = 0; i <= num_records; i++) {
    INIT_PROC_INFO(threadInfo[i]);

  unsigned num_avail = 0;
    // Create an inner scoping level, so that all the goto targets at the end of
    // the loop appear in an outer scoping level. This avoids warnings about
    // jumping past an initialization to a target in the same block.
      buf[sizeof(buf) - 1] = 1;
      bool long_line = false;
      if (!fgets(buf, sizeof(buf), f)) {
        // Read errors presumably because of EOF
        // If there is valid data in threadInfo[num_avail], then fake
        // a blank line to ensure that the last address gets parsed.
        for (i = 0; i <= maxIndex; i++) {
          if (threadInfo[num_avail][i] != UINT_MAX) {
      } else if (!buf[sizeof(buf) - 1]) {
        // The line is longer than the buffer. Set a flag and don't
        // emit an error if we were going to ignore the line, anyway.
#define CHECK_LINE                                                             \
    CLEANUP_THREAD_INFO;                                                       \
    *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \

#if KMP_ARCH_LOONGARCH64
      // The parsing logic of /proc/cpuinfo in this function highly depends on
      // the blank lines between each processor info block. But on LoongArch a
      // blank line exists before the first processor info block (i.e. after the
      // "system type" line). This blank line was added because the "system
      // type" line is unrelated to any of the CPUs. We must skip this line so
      // that the original logic works on LoongArch.
      if (*buf == '\n' && *line == 2)

      char s1[] = "processor";
      if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
        char *p = strchr(buf + sizeof(s1) - 1, ':');
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
        if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
#if KMP_ARCH_AARCH64
        // Handle the old AArch64 /proc/cpuinfo layout differently,
        // it contains all of the 'processor' entries listed in a
        // single 'Processor' section, therefore the normal looking
        // for duplicates in that section will always fail.
        threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
            "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
            threadInfo[num_avail][osIdIndex]);
        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);

        KMP_SNPRINTF(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
                     threadInfo[num_avail][osIdIndex]);
        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);

      char s2[] = "physical id";
      if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
        char *p = strchr(buf + sizeof(s2) - 1, ':');
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
        if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
        threadInfo[num_avail][pkgIdIndex] = val;

      char s3[] = "core id";
      if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
        char *p = strchr(buf + sizeof(s3) - 1, ':');
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
        if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
        threadInfo[num_avail][coreIdIndex] = val;
#endif // KMP_OS_LINUX && USE_SYSFS_INFO
      char s4[] = "thread id";
      if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
        char *p = strchr(buf + sizeof(s4) - 1, ':');
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
        if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
        threadInfo[num_avail][threadIdIndex] = val;

      if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
        char *p = strchr(buf + sizeof(s4) - 1, ':');
        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
        // validate the input before using level:
        if (level > (unsigned)__kmp_xproc) { // level is too big
          level = __kmp_xproc;
        if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
        threadInfo[num_avail][nodeIdIndex + level] = val;

      // We didn't recognize the leading token on the line. There are lots of
      // leading tokens that we don't recognize - if the line isn't empty, go on
      // to the next line.
      if ((*buf != 0) && (*buf != '\n')) {
        // If the line is longer than the buffer, read characters
        // until we find a newline.
        while (((ch = fgetc(f)) != EOF) && (ch != '\n'))

      // A newline has signalled the end of the processor record.
      // Check that there aren't too many procs specified.
      if ((int)num_avail == __kmp_xproc) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_TooManyEntries;

      // Check for missing fields. The osId field must be there, and we
      // currently require that the physical id field is specified, also.
      if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingProcField;
      if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingPhysicalIDField;

      // Skip this proc if it is not included in the machine model.
      if (KMP_AFFINITY_CAPABLE() &&
          !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
                         __kmp_affin_fullMask)) {
        INIT_PROC_INFO(threadInfo[num_avail]);

      // We have a successful parse of this proc's info.
      // Increment the counter, and prepare for the next proc.
      KMP_ASSERT(num_avail <= num_records);
      INIT_PROC_INFO(threadInfo[num_avail]);

    CLEANUP_THREAD_INFO;
    *msg_id = kmp_i18n_str_MissingValCpuinfo;

    CLEANUP_THREAD_INFO;
    *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3190 unsigned teamSize
= 0;
3191 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3193 // check for num_records == __kmp_xproc ???
3195 // If it is configured to omit the package level when there is only a single
3196 // package, the logic at the end of this routine won't work if there is only a
3198 KMP_ASSERT(num_avail
> 0);
3199 KMP_ASSERT(num_avail
<= num_records
);
3201 // Sort the threadInfo table by physical Id.
3202 qsort(threadInfo
, num_avail
, sizeof(*threadInfo
),
3203 __kmp_affinity_cmp_ProcCpuInfo_phys_id
);
3205 // The table is now sorted by pkgId / coreId / threadId, but we really don't
3206 // know the radix of any of the fields. pkgId's may be sparsely assigned among
3207 // the chips on a system. Although coreId's are usually assigned
3208 // [0 .. coresPerPkg-1] and threadId's are usually assigned
3209 // [0..threadsPerCore-1], we don't want to make any such assumptions.
3211 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3212 // total # packages) are at this point - we want to determine that now. We
3213 // only have an upper bound on the first two figures.
3215 (unsigned *)__kmp_allocate((maxIndex
+ 1) * sizeof(unsigned));
3217 (unsigned *)__kmp_allocate((maxIndex
+ 1) * sizeof(unsigned));
3219 (unsigned *)__kmp_allocate((maxIndex
+ 1) * sizeof(unsigned));
3221 (unsigned *)__kmp_allocate((maxIndex
+ 1) * sizeof(unsigned));
3223 bool assign_thread_ids
= false;
3224 unsigned threadIdCt
;
restart_radix_check:

  // Initialize the counter arrays with data from threadInfo[0].
  if (assign_thread_ids) {
    if (threadInfo[0][threadIdIndex] == UINT_MAX) {
      threadInfo[0][threadIdIndex] = threadIdCt++;
    } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
      threadIdCt = threadInfo[0][threadIdIndex] + 1;
  for (index = 0; index <= maxIndex; index++) {
    lastId[index] = threadInfo[0][index];

  // Run through the rest of the OS procs.
  for (i = 1; i < num_avail; i++) {
    // Find the most significant index whose id differs from the id for the
    // previous OS proc.
    for (index = maxIndex; index >= threadIdIndex; index--) {
      if (assign_thread_ids && (index == threadIdIndex)) {
        // Auto-assign the thread id field if it wasn't specified.
        if (threadInfo[i][threadIdIndex] == UINT_MAX) {
          threadInfo[i][threadIdIndex] = threadIdCt++;
        // Apparently the thread id field was specified for some entries and not
        // others. Start the thread id counter off at the next higher thread id.
        else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
          threadIdCt = threadInfo[i][threadIdIndex] + 1;
      if (threadInfo[i][index] != lastId[index]) {
        // Run through all indices which are less significant, and reset the
        // counts to 1. At all levels up to and including index, we need to
        // increment the totals and record the last id.
        for (index2 = threadIdIndex; index2 < index; index2++) {
          if (counts[index2] > maxCt[index2]) {
            maxCt[index2] = counts[index2];
          lastId[index2] = threadInfo[i][index2];
        lastId[index] = threadInfo[i][index];

        if (assign_thread_ids && (index > threadIdIndex)) {
#if KMP_MIC && REDUCE_TEAM_SIZE
          // The default team size is the total #threads in the machine
          // minus 1 thread for every core that has 3 or more threads.
          teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE

          // Restart the thread counter, as we are on a new core.
          // Auto-assign the thread id field if it wasn't specified.
          if (threadInfo[i][threadIdIndex] == UINT_MAX) {
            threadInfo[i][threadIdIndex] = threadIdCt++;

          // Apparently the thread id field was specified for some entries and
          // not others. Start the thread id counter off at the next higher
          else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
            threadIdCt = threadInfo[i][threadIdIndex] + 1;
    if (index < threadIdIndex) {
      // If thread ids were specified, it is an error if they are not unique.
      // Also, check that we haven't already restarted the loop (to be safe -
      // shouldn't need to).
      if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;

      // If the thread ids were not specified and we see entries that
      // are duplicates, start the loop over and assign the thread ids manually.
      assign_thread_ids = true;
      goto restart_radix_check;

#if KMP_MIC && REDUCE_TEAM_SIZE
  // The default team size is the total #threads in the machine
  // minus 1 thread for every core that has 3 or more threads.
  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE

  for (index = threadIdIndex; index <= maxIndex; index++) {
    if (counts[index] > maxCt[index]) {
      maxCt[index] = counts[index];

  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
  nCoresPerPkg = maxCt[coreIdIndex];
  nPackages = totals[pkgIdIndex];
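  // Example: a 2-socket system with 4 cores per socket and 2 hardware threads
  // per core ends up here with maxCt[threadIdIndex] = 2, maxCt[coreIdIndex] = 4
  // and totals[pkgIdIndex] = 2.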
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  __kmp_ncores = totals[coreIdIndex];
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity.type == affinity_none);

#if KMP_MIC && REDUCE_TEAM_SIZE
  // Set the default team size.
  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
    __kmp_dflt_team_nth = teamSize;
    KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
                  "__kmp_dflt_team_nth = %d\n",
                  __kmp_dflt_team_nth));
#endif // KMP_MIC && REDUCE_TEAM_SIZE

  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);

  // Count the number of levels which have more nodes at that level than at the
  // parent's level (with there being an implicit root node of the top level).
  // This is equivalent to saying that there is at least one node at this level
  // which has a sibling. These levels are in the map, and the package level is
  // always in the map.
  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
  for (index = threadIdIndex; index < maxIndex; index++) {
    KMP_ASSERT(totals[index] >= totals[index + 1]);
    inMap[index] = (totals[index] > totals[index + 1]);
  inMap[maxIndex] = (totals[maxIndex] > 1);
  inMap[pkgIdIndex] = true;
  inMap[coreIdIndex] = true;
  inMap[threadIdIndex] = true;
  kmp_hw_t types[KMP_HW_LAST];
  int threadLevel = -1;
  for (index = threadIdIndex; index <= maxIndex; index++) {
  if (inMap[pkgIdIndex]) {
    types[idx++] = KMP_HW_SOCKET;
  if (inMap[coreIdIndex]) {
    types[idx++] = KMP_HW_CORE;
  if (inMap[threadIdIndex]) {
    types[idx++] = KMP_HW_THREAD;
  KMP_ASSERT(depth > 0);

  // Construct the data structure that is to be returned.
  __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);

  for (i = 0; i < num_avail; ++i) {
    unsigned os = threadInfo[i][osIdIndex];
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
    hw_thread.os_id = os;
    for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
      if (!inMap[src_index]) {
      if (src_index == pkgIdIndex) {
        hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
      } else if (src_index == coreIdIndex) {
        hw_thread.ids[coreLevel] = threadInfo[i][src_index];
      } else if (src_index == threadIdIndex) {
        hw_thread.ids[threadLevel] = threadInfo[i][src_index];

  CLEANUP_THREAD_INFO;
  __kmp_topology->sort_ids();
  if (!__kmp_topology->check_ids()) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
    *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
template <typename FindNextFunctionType>
static void __kmp_create_os_id_masks(unsigned *numUnique,
                                     kmp_affinity_t &affinity,
                                     FindNextFunctionType find_next) {
  // First form a table of affinity masks in order of OS thread id.
  int numAddrs = __kmp_topology->get_num_hw_threads();
  int depth = __kmp_topology->get_depth();
  const char *env_var = __kmp_get_affinity_env_var(affinity);
  KMP_ASSERT(numAddrs);

  // If could not find HW thread location with attributes, then return and
  // fallback to increment find_next and disregard core attributes.
  for (i = numAddrs - 1;; --i) {
    int osId = __kmp_topology->at(i).os_id;
    if (osId > maxOsId) {
  affinity.num_os_id_masks = maxOsId + 1;
  KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
  KMP_ASSERT(affinity.gran_levels >= 0);
  if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
    KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
  if (affinity.gran_levels >= (int)depth) {
    KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);

  // Run through the table, forming the masks for all threads on each core.
  // Threads on the same core will have identical kmp_hw_thread_t objects, not
  // considering the last level, which must be the thread id. All threads on a
  // core will appear consecutively.
  int j = 0; // index of 1st thread on core
  kmp_affin_mask_t *sum;
  KMP_CPU_ALLOC_ON_STACK(sum);
  i = j = leader = find_next(-1);
  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
  kmp_full_mask_modifier_t full_mask;
  for (i = find_next(i); i < numAddrs; i = find_next(i)) {
    // If this thread is sufficiently close to the leader (within the
    // granularity setting), then set the bit for this os thread in the
    // affinity mask for this group, and go on to the next thread.
    if (__kmp_topology->is_close(leader, i, affinity)) {
      KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);

    // For every thread in this group, copy the mask to the thread's entry in
    // the OS Id mask table. Mark the first address as a leader.
    for (; j < i; j = find_next(j)) {
      int osId = __kmp_topology->at(j).os_id;
      KMP_DEBUG_ASSERT(osId <= maxOsId);
      kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
      KMP_CPU_COPY(mask, sum);
      __kmp_topology->at(j).leader = (j == leader);

    // Start a new mask.
    full_mask.include(sum);
    KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);

  // For every thread in last group, copy the mask to the thread's
  // entry in the OS Id mask table.
  for (; j < i; j = find_next(j)) {
    int osId = __kmp_topology->at(j).os_id;
    KMP_DEBUG_ASSERT(osId <= maxOsId);
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
    KMP_CPU_COPY(mask, sum);
    __kmp_topology->at(j).leader = (j == leader);
  full_mask.include(sum);
  KMP_CPU_FREE_FROM_STACK(sum);
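  // Example: with granularity=core on cores carrying two hardware threads,
  // OS procs 0 and 1 are "close", so both of their entries in
  // affinity.os_id_masks end up as the same two-bit mask {0,1}.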
  // See if the OS Id mask table further restricts or changes the full mask
  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
    __kmp_topology->print(env_var);

  *numUnique = unique;

// Stuff for the affinity proclist parsers. It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.
static kmp_affin_mask_t *newMasks;
static int numNewMasks;
static int nextNewMask;

#define ADD_MASK(_mask)                                                        \
    if (nextNewMask >= numNewMasks) {                                          \
      kmp_affin_mask_t *temp;                                                  \
      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
      for (i = 0; i < numNewMasks / 2; i++) {                                  \
        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
        KMP_CPU_COPY(dest, src);                                               \
      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \

#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
    if (((_osId) > _maxOsId) ||                                                \
        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
      KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId);                \
      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
  kmp_affin_mask_t **out_masks = &affinity.masks;
  unsigned *out_numMasks = &affinity.num_masks;
  const char *proclist = affinity.proclist;
  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
  int maxOsId = affinity.num_os_id_masks - 1;
  const char *scan = proclist;
  const char *next = proclist;

  // We use malloc() for the temporary mask vector, so that we can use
  // realloc() to extend it.
  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
  kmp_affin_mask_t *sumMask;
  KMP_CPU_ALLOC(sumMask);

  for (;;) {
    int start, end, stride;

    if (*next == '\0') {
      break;
    }

    if (*next == '{') {
      int num;

      // Read the first integer in the set.
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
      num = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(num >= 0, "bad explicit proc list");

      // Copy the mask for that osId to the sum (union) mask.
      if ((num > maxOsId) ||
          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
        KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
        KMP_CPU_ZERO(sumMask);
      } else {
        KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
      }

      for (;;) {
        // Check for end of set.
        if (*next == '}') {
          next++; // skip '}'
          break;
        }

        // Skip optional comma.
        if (*next == ',') {
          next++;
        }

        // Read the next integer in the set.
        KMP_ASSERT2((*next >= '0') && (*next <= '9'),
                    "bad explicit proc list");
        num = __kmp_str_to_int(scan, *next);
        KMP_ASSERT2(num >= 0, "bad explicit proc list");

        // Add the mask for that osId to the sum mask.
        if ((num > maxOsId) ||
            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
          KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
        } else {
          KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
        }
      }
      ADD_MASK(sumMask);
      continue;
    }

    // Read the first integer.
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    start = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(start >= 0, "bad explicit proc list");

    // If this isn't a range, then add a mask to the list and go on.
    if (*next != '-') {
      ADD_MASK_OSID(start, osId2Mask, maxOsId);

      // Skip optional comma.
      if (*next == ',') {
        next++;
      }
      continue;
    }

    // This is a range.  Skip over the '-' and read in the 2nd int.
    next++; // skip '-'
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    end = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(end >= 0, "bad explicit proc list");

    // Check for a stride parameter
    stride = 1;
    if (*next == ':') {
      // A stride is specified.  Skip over the ':' and read the 3rd int.
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
      stride = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(stride >= 0, "bad explicit proc list");
    }

    // Do some range checks.
    KMP_ASSERT2(stride != 0, "bad explicit proc list");
    if (stride > 0) {
      KMP_ASSERT2(start <= end, "bad explicit proc list");
    } else {
      KMP_ASSERT2(start >= end, "bad explicit proc list");
    }
    KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");

    // Add the mask for each OS proc # to the list.
    if (stride > 0) {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start <= end);
    } else {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start >= end);
    }

    // Skip optional comma.
    if (*next == ',') {
      next++;
    }
  }

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
  KMP_CPU_FREE(sumMask);
}
/*-----------------------------------------------------------------------------
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:

place_list := place
place_list := place , place_list
place := num
place := place : num
place := place : num : signed
place := { subplacelist }
place := ! place           // (lowest priority)
subplace_list := subplace
subplace_list := subplace , subplace_list
subplace := num
subplace := num : num
subplace := num : num : signed
-----------------------------------------------------------------------------*/
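
// Illustrative examples of this grammar (hypothetical proc ids): the place
// list "{0,1},{2,3}" names two explicit places; "{0:4}" names a single place
// holding four consecutive proc ids starting at 0; and "{0,1}:2:8" names the
// place {0,1} plus one more place built by shifting it 8 proc ids upward.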
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affinity_t &affinity, int maxOsId,
                                        kmp_affin_mask_t *tempMask,
                                        int *setSize) {
  const char *next;
  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;

  for (;;) {
    int start, count, stride, i;

    // Read in the starting proc id
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    start = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(start >= 0);

    // valid follow sets are ',' ':' and '}'
    if (**scan == '}' || **scan == ',') {
      if ((start > maxOsId) ||
          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
        KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
      } else {
        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
        (*setSize)++;
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read count parameter
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    count = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(count >= 0);

    // valid follow sets are ',' ':' and '}'
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          (*setSize)++;
          start++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read stride parameter
    int sign = +1;
    if (**scan == '+') {
      (*scan)++; // skip '+'
    }
    if (**scan == '-') {
      sign = -1;
      (*scan)++; // skip '-'
    }
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    stride = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(stride >= 0);
    stride *= sign;

    // valid follow sets are ',' and '}'
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          (*setSize)++;
          start += stride;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }
}
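
// Illustrative example (hypothetical proc ids): a subplace list such as
// "{0:4}" unions proc ids 0,1,2,3 into one place mask, while "{0:4:2}" unions
// 0,2,4,6 by advancing the start id by the stride after each of the four
// counts.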
static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
                                int maxOsId, kmp_affin_mask_t *tempMask,
                                int *setSize) {
  const char *next;
  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;

  // valid follow sets are '{' '!' and num
  if (**scan == '{') {
    (*scan)++; // skip '{'
    __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
    KMP_ASSERT2(**scan == '}', "bad explicit places list");
    (*scan)++; // skip '}'
  } else if (**scan == '!') {
    (*scan)++; // skip '!'
    __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
    KMP_CPU_COMPLEMENT(maxOsId, tempMask);
  } else if ((**scan >= '0') && (**scan <= '9')) {
    next = *scan;
    int num = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(num >= 0);
    if ((num > maxOsId) ||
        (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
      KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
    } else {
      KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
      (*setSize)++;
    }
    *scan = next; // skip num
  } else {
    KMP_ASSERT2(0, "bad explicit places list");
  }
}
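
// Illustrative example (hypothetical proc ids): a negated place such as
// "!{0,1}" is handled above by recursing on the inner place and then
// complementing tempMask up to maxOsId, so with OS procs 0-7 it denotes the
// place {2,3,4,5,6,7}.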
void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
  int i, j, count, stride, sign;
  kmp_affin_mask_t **out_masks = &affinity.masks;
  unsigned *out_numMasks = &affinity.num_masks;
  const char *placelist = affinity.proclist;
  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
  int maxOsId = affinity.num_os_id_masks - 1;
  const char *scan = placelist;
  const char *next = placelist;

  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);

  // tempMask is modified based on the previous or initial
  //   place to form the current place
  // previousMask contains the previous place
  kmp_affin_mask_t *tempMask;
  kmp_affin_mask_t *previousMask;
  KMP_CPU_ALLOC(tempMask);
  KMP_CPU_ZERO(tempMask);
  KMP_CPU_ALLOC(previousMask);
  KMP_CPU_ZERO(previousMask);
  int setSize = 0;

  for (;;) {
    __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);

    // valid follow sets are ',' ':' and EOL
    if (*scan == '\0' || *scan == ',') {
      if (setSize > 0) {
        ADD_MASK(tempMask);
      }
      KMP_CPU_ZERO(tempMask);
      setSize = 0;
      if (*scan == '\0') {
        break;
      }
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(*scan == ':', "bad explicit places list");
    scan++; // skip ':'

    // Read count parameter
    KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
    count = __kmp_str_to_int(scan, *next);
    KMP_ASSERT(count >= 0);

    // valid follow sets are ',' ':' and EOL
    if (*scan == '\0' || *scan == ',') {
      stride = +1;
    } else {
      KMP_ASSERT2(*scan == ':', "bad explicit places list");
      scan++; // skip ':'

      // Read stride parameter
      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
      stride = __kmp_str_to_int(scan, *next);
      KMP_DEBUG_ASSERT(stride >= 0);
    }

    // Add places determined by initial_place : count : stride
    for (i = 0; i < count; i++) {
      // Add the current place, then build the next place (tempMask) from that
      KMP_CPU_COPY(previousMask, tempMask);
      ADD_MASK(previousMask);
      KMP_CPU_ZERO(tempMask);
      KMP_CPU_SET_ITERATE(j, previousMask) {
        if (!KMP_CPU_ISSET(j, previousMask)) {
          continue;
        }
        if ((j + stride > maxOsId) || (j + stride < 0) ||
            (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
            (!KMP_CPU_ISSET(j + stride,
                            KMP_CPU_INDEX(osId2Mask, j + stride)))) {
          if (i < count - 1) {
            KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
          }
          continue;
        }
        KMP_CPU_SET(j + stride, tempMask);
      }
    }
    KMP_CPU_ZERO(tempMask);
    setSize = 0;

    // valid follow sets are ',' and EOL
    if (*scan == '\0') {
      break;
    }
    if (*scan == ',') {
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  KMP_CPU_FREE(tempMask);
  KMP_CPU_FREE(previousMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
}

#undef ADD_MASK_OSID
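
// Illustrative example (hypothetical proc ids): with OMP_PLACES="{0,1}:3:4"
// the loop above copies the previous place and shifts every set bit by the
// stride, producing the places {0,1}, {4,5} and {8,9}; shifted ids that are
// invalid are skipped (a warning is emitted for all but the final iteration).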
// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
  int core_level = 0;

  for (int i = 0; i < nprocs; i++) {
    const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
    for (int j = bottom_level; j > 0; j--) {
      if (hw_thread.ids[j] > 0) {
        if (core_level < (j - 1)) {
          core_level = j - 1;
        }
      }
    }
  }
  return core_level;
}

// This function counts number of clusters/cores at given level.
static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
                                         int core_level) {
  return __kmp_topology->get_count(core_level);
}

// This function finds to which cluster/core given processing unit is bound.
static int __kmp_affinity_find_core(int proc, int bottom_level,
                                    int core_level) {
  int core = 0;
  KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
  for (int i = 0; i <= proc; ++i) {
    if (i + 1 <= proc) {
      for (int j = 0; j <= core_level; ++j) {
        if (__kmp_topology->at(i + 1).sub_ids[j] !=
            __kmp_topology->at(i).sub_ids[j]) {
          core++;
          break;
        }
      }
    }
  }
  return core;
}

// This function finds maximal number of processing units bound to a
// cluster/core at given level.
static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
                                            int core_level) {
  if (core_level >= bottom_level)
    return 1;
  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
  return __kmp_topology->calculate_ratio(thread_level, core_level);
}

static int *procarr = NULL;
static int __kmp_aff_depth = 0;
static int *__kmp_osid_to_hwthread_map = NULL;
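
// Illustrative example (hypothetical topology): for a uniform machine recorded
// as 2 sockets x 4 cores x 2 HW threads, bottom_level is the thread level;
// __kmp_affinity_find_core_level() resolves to the core level,
// __kmp_affinity_compute_ncores() then reports 8 cores, and
// __kmp_affinity_max_proc_per_core() reports 2 processing units per core.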
static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
                                                  kmp_affinity_ids_t &ids,
                                                  kmp_affinity_attrs_t &attrs) {
  if (!KMP_AFFINITY_CAPABLE())
    return;

  // Initialize ids and attrs thread data
  for (int i = 0; i < KMP_HW_LAST; ++i)
    ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
  attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

  // Iterate through each os id within the mask and determine
  // the topology id and attribute information
  int cpu;
  int depth = __kmp_topology->get_depth();
  KMP_CPU_SET_ITERATE(cpu, mask) {
    int osid_idx = __kmp_osid_to_hwthread_map[cpu];
    const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = __kmp_topology->get_type(level);
      int id = hw_thread.sub_ids[level];
      if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
        ids[type] = id;
      } else {
        // This mask spans across multiple topology units, set it as such
        // and mark every level below as such as well.
        ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
        for (; level < depth; ++level) {
          kmp_hw_t type = __kmp_topology->get_type(level);
          ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
        }
      }
    }
    if (!attrs.valid) {
      attrs.core_type = hw_thread.attrs.get_core_type();
      attrs.core_eff = hw_thread.attrs.get_core_eff();
      attrs.valid = 1;
    } else {
      // This mask spans across multiple attributes, set it as such
      if (attrs.core_type != hw_thread.attrs.get_core_type())
        attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      if (attrs.core_eff != hw_thread.attrs.get_core_eff())
        attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
    }
  }
}
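
// Illustrative example: a place mask whose OS procs sit on two different cores
// of the same socket keeps its socket id but records the core and thread
// levels as kmp_hw_thread_t::MULTIPLE_ID; likewise a mask mixing core types
// (e.g., P-cores and E-cores) falls back to KMP_HW_CORE_TYPE_UNKNOWN.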
static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
  if (!KMP_AFFINITY_CAPABLE())
    return;
  const kmp_affin_mask_t *mask = th->th.th_affin_mask;
  kmp_affinity_ids_t &ids = th->th.th_topology_ids;
  kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
}

// Assign the topology information to each place in the place list.
// A thread can then grab not only its affinity mask, but the topology
// information associated with that mask, e.g., which socket a thread is on.
static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
  if (!KMP_AFFINITY_CAPABLE())
    return;
  if (affinity.type != affinity_none) {
    KMP_ASSERT(affinity.num_os_id_masks);
    KMP_ASSERT(affinity.os_id_masks);
  }
  KMP_ASSERT(affinity.num_masks);
  KMP_ASSERT(affinity.masks);
  KMP_ASSERT(__kmp_affin_fullMask);

  int max_cpu = __kmp_affin_fullMask->get_max_cpu();
  int num_hw_threads = __kmp_topology->get_num_hw_threads();

  // Allocate thread topology information
  if (!affinity.ids) {
    affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
        sizeof(kmp_affinity_ids_t) * affinity.num_masks);
  }
  if (!affinity.attrs) {
    affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
        sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
  }
  if (!__kmp_osid_to_hwthread_map) {
    // Want the +1 because max_cpu should be valid index into map
    __kmp_osid_to_hwthread_map =
        (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
  }

  // Create the OS proc to hardware thread map
  for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
    int os_id = __kmp_topology->at(hw_thread).os_id;
    if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
      __kmp_osid_to_hwthread_map[os_id] = hw_thread;
  }

  for (unsigned i = 0; i < affinity.num_masks; ++i) {
    kmp_affinity_ids_t &ids = affinity.ids[i];
    kmp_affinity_attrs_t &attrs = affinity.attrs[i];
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
    __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
  }
}
// Called when __kmp_topology is ready
static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
  // Initialize other data structures which depend on the topology
  if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
    machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
    __kmp_affinity_get_topology_info(affinity);
  }
}

// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(affinity.type == affinity_none);
  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
  affinity.num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
  __kmp_aux_affinity_initialize_other_data(affinity);
}
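
// Illustrative note: with KMP_AFFINITY=none the place list built above has
// exactly one entry mirroring __kmp_affin_fullMask, so place queries report a
// single place covering every usable OS proc.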
static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors that
  // we know about on the machine.
  int verbose = affinity.flags.verbose;
  const char *env_var = affinity.env_var;

  // Already initialized
  if (__kmp_affin_fullMask && __kmp_affin_origMask)
    return;

  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (__kmp_affin_origMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_origMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
    // Make a copy before possible expanding to the entire machine mask
    __kmp_affin_origMask->copy(__kmp_affin_fullMask);
    if (affinity.flags.respect) {
      // Count the number of available processors.
      int i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
        affinity.type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }

      if (verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetRespect, env_var, buf);
      }
    } else {
      if (verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
      }
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups <= 1) {
        // Copy expanded full mask if topology has single processor group
        __kmp_affin_origMask->copy(__kmp_affin_fullMask);
      }
      // Set the process affinity mask since threads' affinity
      // masks must be subset of process mask in Windows* OS
      __kmp_affin_fullMask->set_process_affinity(true);
#endif
    }
  }
}
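
// Illustrative example: if the process is launched under "taskset -c 0-3" and
// the default respect setting is in effect, the full mask above stays
// {0,1,2,3} and __kmp_avail_proc becomes 4; with norespect the full mask is
// widened to every processor known to the machine model.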
static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
  bool success = false;
  const char *env_var = affinity.env_var;
  kmp_i18n_id_t msg_id = kmp_i18n_null;
  int verbose = affinity.flags.verbose;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
// In the default code path, errors are not fatal - we just try using
// another method. We only emit a warning message if affinity is on, or the
// verbose flag is set, and the nowarnings flag was not set.
#if KMP_USE_HWLOC
    if (!success &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (!__kmp_hwloc_error) {
        success = __kmp_affinity_create_hwloc_map(&msg_id);
        if (!success && verbose) {
          KMP_INFORM(AffIgnoringHwloc, env_var);
        }
      } else if (verbose) {
        KMP_INFORM(AffIgnoringHwloc, env_var);
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (!success) {
      success = __kmp_affinity_create_x2apicid_map(&msg_id);
      if (!success && verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
      }
    }
    if (!success) {
      success = __kmp_affinity_create_apicid_map(&msg_id);
      if (!success && verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX
    if (!success) {
      int line = 0;
      success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
      if (!success && verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY
    if (!success && (__kmp_num_proc_groups > 1)) {
      success = __kmp_affinity_create_proc_group_map(&msg_id);
      if (!success && verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_GROUP_AFFINITY */

    if (!success) {
      success = __kmp_affinity_create_flat_map(&msg_id);
      if (!success && verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
      }
      KMP_ASSERT(success);
    }
  }

// If the user has specified that a particular topology discovery method is to
// be used, then we abort if that method fails. The exception is group
// affinity, which might have been implicitly set.
#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    success = __kmp_affinity_create_hwloc_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif // KMP_USE_HWLOC

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
           __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    success = __kmp_affinity_create_x2apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    success = __kmp_affinity_create_apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    int line = 0;
    success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      const char *filename = __kmp_cpuinfo_get_filename();
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
  }

#if KMP_GROUP_AFFINITY
  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    success = __kmp_affinity_create_proc_group_map(&msg_id);
    KMP_ASSERT(success);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    success = __kmp_affinity_create_flat_map(&msg_id);
    KMP_ASSERT(success);
  }

  // Early exit if topology could not be created
  if (!__kmp_topology) {
    if (KMP_AFFINITY_CAPABLE()) {
      KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
    }
    if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
        __kmp_ncores > 0) {
      __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
      __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
                                   __kmp_nThreadsPerCore, __kmp_ncores);
      if (verbose)
        __kmp_topology->print(env_var);
    }
    return false;
  }

  // Canonicalize, print (if requested), apply KMP_HW_SUBSET
  __kmp_topology->canonicalize();
  if (verbose)
    __kmp_topology->print(env_var);
  bool filtered = __kmp_topology->filter_hw_subset();
  if (filtered && verbose)
    __kmp_topology->print("KMP_HW_SUBSET");
  return success;
}
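
// Illustrative note: in the default affinity_top_method_all path above, the
// discovery methods are tried in order: hwloc (when built in), the x2APIC and
// legacy APIC leaves on x86, /proc/cpuinfo on Linux, processor groups when
// more than one group exists, and finally the flat map, which is asserted to
// succeed.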
static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
  bool is_regular_affinity = (&affinity == &__kmp_affinity);
  bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
  const char *env_var = __kmp_get_affinity_env_var(affinity);

  if (affinity.flags.initialized) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
    __kmp_aux_affinity_initialize_masks(affinity);

  if (is_regular_affinity && !__kmp_topology) {
    bool success = __kmp_aux_affinity_initialize_topology(affinity);
    if (success) {
      KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
    } else {
      affinity.type = affinity_none;
      KMP_AFFINITY_DISABLE();
    }
  }

  // If KMP_AFFINITY=none, then only create the single "none" place
  // which is the process's initial affinity mask or the number of
  // hardware threads depending on respect,norespect
  if (affinity.type == affinity_none) {
    __kmp_create_affinity_none_places(affinity);
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    affinity.flags.initialized = TRUE;
    return;
  }

  __kmp_topology->set_granularity(affinity);
  int depth = __kmp_topology->get_depth();

  // Create the table of masks, indexed by thread Id.
  unsigned numUnique;
  int numAddrs = __kmp_topology->get_num_hw_threads();
  // If OMP_PLACES=cores:<attribute> specified, then attempt
  // to make OS Id mask table using those attributes
  if (affinity.core_attr_gran.valid) {
    __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
      KMP_ASSERT(idx >= -1);
      for (int i = idx + 1; i < numAddrs; ++i)
        if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
          return i;
      return numAddrs;
    });
    if (!affinity.os_id_masks) {
      const char *core_attribute;
      if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
        core_attribute = "core_efficiency";
      else
        core_attribute = "core_type";
      KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
                      core_attribute,
                      __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
    }
  }
  // If core attributes did not work, or none were specified,
  // then make OS Id mask table using typical incremental way.
  if (!affinity.os_id_masks) {
    __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
      KMP_ASSERT(idx >= -1);
      return idx + 1;
    });
  }
  if (affinity.gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  switch (affinity.type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(affinity.proclist != NULL);
    if (is_hidden_helper_affinity ||
        __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
      __kmp_affinity_process_proclist(affinity);
    } else {
      __kmp_affinity_process_placelist(affinity);
    }
    if (affinity.num_masks == 0) {
      KMP_AFF_WARNING(affinity, AffNoValidProcID);
      affinity.type = affinity_none;
      __kmp_create_affinity_none_places(affinity);
      affinity.flags.initialized = TRUE;
      return;
    }
    break;

  // The other affinity types rely on sorting the hardware threads according to
  // some permutation of the machine topology tree. Set affinity.compact
  // and affinity.offset appropriately, then jump to a common code
  // fragment to do the sort and create the array of affinity masks.
  case affinity_logical:
    affinity.compact = 0;
    if (affinity.offset) {
      affinity.offset =
          __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      affinity.compact = 1;
      if (affinity.compact >= depth) {
        affinity.compact = 0;
      }
    } else {
      affinity.compact = 0;
    }
    if (affinity.offset) {
      affinity.offset =
          __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_scatter:
    if (affinity.compact >= depth) {
      affinity.compact = 0;
    } else {
      affinity.compact = depth - 1 - affinity.compact;
    }
    goto sortTopology;

  case affinity_compact:
    if (affinity.compact >= depth) {
      affinity.compact = depth - 1;
    }
    goto sortTopology;

  case affinity_balanced:
    if (depth <= 1 || is_hidden_helper_affinity) {
      KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
      affinity.type = affinity_none;
      __kmp_create_affinity_none_places(affinity);
      affinity.flags.initialized = TRUE;
      return;
    } else if (!__kmp_topology->is_uniform()) {
      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level =
          __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
                                                 core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
        affinity.type = affinity_none;
        __kmp_create_affinity_none_places(affinity);
        affinity.flags.initialized = TRUE;
        return;
      }

      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = __kmp_topology->at(i).os_id;
        int core = __kmp_affinity_find_core(i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }
    }
    if (affinity.compact >= depth) {
      affinity.compact = depth - 1;
    }

  sortTopology:
    // Allocate the gtid->affinity mask table.
    if (affinity.flags.dups) {
      affinity.num_masks = __kmp_avail_proc;
    } else {
      affinity.num_masks = numUnique;
    }

    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
        !is_hidden_helper_affinity) {
      affinity.num_masks = __kmp_affinity_num_places;
    }

    KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);

    // Sort the topology table according to the current setting of
    // affinity.compact, then fill out affinity.masks.
    __kmp_topology->sort_compact(affinity);
    {
      int i;
      unsigned j;
      int num_hw_threads = __kmp_topology->get_num_hw_threads();
      kmp_full_mask_modifier_t full_mask;
      for (i = 0, j = 0; i < num_hw_threads; i++) {
        if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
          continue;
        }
        int osId = __kmp_topology->at(i).os_id;

        kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        full_mask.include(src);
        if (++j >= affinity.num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == affinity.num_masks);
      // See if the places list further restricts or changes the full mask
      if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
        __kmp_topology->print(env_var);
      }
    }
    // Sort the topology back using ids
    __kmp_topology->sort_ids();
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }
  __kmp_aux_affinity_initialize_other_data(affinity);
  affinity.flags.initialized = TRUE;
}
void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then affinity type == affinity_none.
  // We now explicitly represent this as affinity type == affinity_disabled.
  // There are too many checks for affinity type == affinity_none in this code.
  // Instead of trying to change them all, check if
  // affinity type == affinity_disabled, and if so, slam it with affinity_none,
  // call the real initialization routine, then restore affinity type to
  // affinity_disabled.
  int disabled = (affinity.type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE())
    KMP_ASSERT(disabled);
  if (disabled)
    affinity.type = affinity_none;
  __kmp_aux_affinity_initialize(affinity);
  if (disabled)
    affinity.type = affinity_disabled;
}
void __kmp_affinity_uninitialize(void) {
  for (kmp_affinity_t *affinity : __kmp_affinities) {
    if (affinity->masks != NULL)
      KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
    if (affinity->os_id_masks != NULL)
      KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
    if (affinity->proclist != NULL)
      __kmp_free(affinity->proclist);
    if (affinity->ids != NULL)
      __kmp_free(affinity->ids);
    if (affinity->attrs != NULL)
      __kmp_free(affinity->attrs);
    *affinity = KMP_AFFINITY_INIT(affinity->env_var);
  }
  if (__kmp_affin_origMask != NULL) {
    if (KMP_AFFINITY_CAPABLE()) {
      __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
    }
    KMP_CPU_FREE(__kmp_affin_origMask);
    __kmp_affin_origMask = NULL;
  }
  __kmp_affinity_num_places = 0;
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
  if (__kmp_osid_to_hwthread_map) {
    __kmp_free(__kmp_osid_to_hwthread_map);
    __kmp_osid_to_hwthread_map = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  if (__kmp_hw_subset) {
    kmp_hw_subset_t::deallocate(__kmp_hw_subset);
    __kmp_hw_subset = nullptr;
  }
  if (__kmp_topology) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
  }
  KMPAffinity::destroy_api();
}
static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
                                      int *place, kmp_affin_mask_t **mask) {
  int mask_idx;
  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
  if (is_hidden_helper)
    // The first gtid is the regular primary thread, the second gtid is the main
    // thread of hidden team which does not participate in task execution.
    mask_idx = gtid - 2;
  else
    mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
  KMP_DEBUG_ASSERT(affinity->num_masks > 0);
  *place = (mask_idx + affinity->offset) % affinity->num_masks;
  *mask = KMP_CPU_INDEX(affinity->masks, *place);
}
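
// Illustrative note: hidden-helper gtids start at 2 (gtid 1 is the hidden
// team's main thread, which executes no tasks), so they map onto mask indices
// 0, 1, 2, ...; regular threads first have their gtid adjusted to skip the
// hidden helpers, after which the offset and the modulo by num_masks select
// their place.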
// This function initializes the per-thread data concerning affinity including
// the mask and topology information
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  // Set the thread topology information to default of unknown
  for (int id = 0; id < KMP_HW_LAST; ++id)
    th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
  th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
  // one that has all of the OS proc ids set, or if
  // __kmp_affinity.flags.respect is set, then the full mask is the
  // same as the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i;
  const kmp_affinity_t *affinity;
  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);

  if (is_hidden_helper)
    affinity = &__kmp_hh_affinity;
  else
    affinity = &__kmp_affinity;

  if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
    if ((affinity->type == affinity_none) ||
        (affinity->type == affinity_balanced) ||
        KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
    }
  } else {
    if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
    }
  }

  th->th.th_current_place = i;
  if (isa_root && !is_hidden_helper) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = affinity->num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a Non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = affinity->num_masks - 1;
  }
  // Copy topology information associated with the place
  if (i >= 0) {
    th->th.th_topology_ids = __kmp_affinity.ids[i];
    th->th.th_topology_attrs = __kmp_affinity.attrs[i];
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);
}
void __kmp_affinity_bind_init_mask(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }
  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  const kmp_affinity_t *affinity;
  const char *env_var;
  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);

  if (is_hidden_helper)
    affinity = &__kmp_hh_affinity;
  else
    affinity = &__kmp_affinity;
  env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
  /* to avoid duplicate printing (will be correctly printed on barrier) */
  if (affinity->flags.verbose && (affinity->type == affinity_none ||
                                  (th->th.th_current_place != KMP_PLACE_ALL &&
                                   affinity->type != affinity_balanced)) &&
      !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
               gtid, buf);
  }

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  if (affinity->type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
void __kmp_affinity_bind_place(int gtid) {
  // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
  if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity.flags.verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity.num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}
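
// Illustrative note: a successful kmp_set_affinity() call above detaches the
// calling thread from the OpenMP place list (its current and new places become
// KMP_PLACE_UNDEFINED) and clears proc_bind for the current task, so later
// nested parallel regions will not re-bind it.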
int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
#if KMP_OS_WINDOWS || KMP_DEBUG
  kmp_info_t *th;
#endif
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
#if KMP_OS_WINDOWS || KMP_DEBUG
  th = __kmp_threads[gtid];
#else
  (void)gtid; // unused variable
#endif
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  th->th.th_affin_mask);
        __kmp_printf(
            "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
            buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_printf(
            "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
            buf);
      });
  return retval;

#else
  (void)retval;

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}
int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;
  const char *env_var = "KMP_AFFINITY";

  // Do not perform balanced affinity for the hidden helper threads
  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
    return;

  switch (__kmp_affinity.gran) {
  case KMP_HW_THREAD:
    break;
  case KMP_HW_CORE:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case KMP_HW_SOCKET:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_topology->is_uniform()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to it - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity.flags.verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
                 tid, buf);
    }
    __kmp_affinity_get_thread_topology_info(th);
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity.flags.verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
                 tid, buf);
    }
    __kmp_affinity_get_thread_topology_info(th);
    __kmp_set_system_affinity(mask, TRUE);
  }
}
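
// Illustrative example of the uniform-topology math above (hypothetical
// machine): with 4 cores and nthreads = 6, chunk = 1 and big_cores = 2, so
// big_nth = 4; tids 0-3 land on the two "big" cores (two threads each) and
// tids 4 and 5 get cores 2 and 3 respectively.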
#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
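
// Illustrative usage sketch for the steps listed above (hypothetical caller
// code, not part of this runtime):
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   int rc = kmp_set_thread_affinity_mask_initial();               // step 2
//   if (rc != 0) { /* step 3: handle -1 or errno */ }
//   run_non_openmp_work();                                         // step 4
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5
// run_non_openmp_work() stands in for whatever non-OpenMP parallel library the
// application uses.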