/*
 * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H

#include "kmp_dispatch.h"
// Layer type for scheduling hierarchy
enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1,
  LAYER_L2,
  LAYER_L3,
  LAYER_NUMA,
  LAYER_LOOP,
  LAYER_LAST
};

// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD:
    return "THREAD";
  case kmp_hier_layer_e::LAYER_L1:
    return "L1";
  case kmp_hier_layer_e::LAYER_L2:
    return "L2";
  case kmp_hier_layer_e::LAYER_L3:
    return "L3";
  case kmp_hier_layer_e::LAYER_NUMA:
    return "NUMA";
  case kmp_hier_layer_e::LAYER_LOOP:
    return "LOOP";
  case kmp_hier_layer_e::LAYER_LAST:
    return "LAST";
  }
  KMP_ASSERT(0);
  // Appease compilers, should never get here
  return "ERROR";
}
// Structure to store values parsed from OMP_SCHEDULE for the scheduling
// hierarchy
typedef struct kmp_hier_sched_env_t {
  int size;
  int capacity;
  enum sched_type *scheds;
  kmp_int32 *small_chunks;
  kmp_int64 *large_chunks;
  kmp_hier_layer_e *layers;
  // Append a level of the hierarchy
  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
    if (capacity == 0) {
      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                                  kmp_hier_layer_e::LAYER_LAST);
      capacity = kmp_hier_layer_e::LAYER_LAST;
    }
    int current_size = size;
    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
    scheds[current_size] = sched;
    layers[current_size] = layer;
    small_chunks[current_size] = chunk;
    large_chunks[current_size] = (kmp_int64)chunk;
    size++;
  }
  // Sort the hierarchy using selection sort, size will always be small
  // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
  void sort() {
    if (size <= 1)
      return;
    for (int i = 0; i < size; ++i) {
      int switch_index = i;
      for (int j = i + 1; j < size; ++j) {
        if (layers[j] < layers[switch_index])
          switch_index = j;
      }
      if (switch_index != i) {
        kmp_hier_layer_e temp1 = layers[i];
        enum sched_type temp2 = scheds[i];
        kmp_int32 temp3 = small_chunks[i];
        kmp_int64 temp4 = large_chunks[i];
        layers[i] = layers[switch_index];
        scheds[i] = scheds[switch_index];
        small_chunks[i] = small_chunks[switch_index];
        large_chunks[i] = large_chunks[switch_index];
        layers[switch_index] = temp1;
        scheds[switch_index] = temp2;
        small_chunks[switch_index] = temp3;
        large_chunks[switch_index] = temp4;
      }
    }
  }
  // Free all memory allocated by append()
  void deallocate() {
    if (capacity > 0) {
      __kmp_free(scheds);
      __kmp_free(layers);
      __kmp_free(small_chunks);
      __kmp_free(large_chunks);
      scheds = NULL;
      layers = NULL;
      small_chunks = NULL;
      large_chunks = NULL;
    }
    size = 0;
    capacity = 0;
  }
} kmp_hier_sched_env_t;
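
// Illustrative sketch (not part of the runtime) of how a parser might fill
// this structure; the schedule kinds and chunk sizes below are hypothetical:
//
//   kmp_hier_sched_env_t env = {}; // zero size/capacity before first append()
//   env.append(kmp_sch_dynamic_chunked, 16, kmp_hier_layer_e::LAYER_L2);
//   env.append(kmp_sch_static, 1, kmp_hier_layer_e::LAYER_L1);
//   env.sort(); // selection sort orders the entries L1, then L2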

extern int __kmp_dispatch_hand_threading;
extern kmp_hier_sched_env_t __kmp_hier_scheds;

// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];

extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
                                        kmp_hier_layer_e t2);
extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);

template <typename T> struct kmp_hier_shared_bdata_t {
  typedef typename traits_t<T>::signed_t ST;
  volatile kmp_uint64 val[2];
  kmp_int32 status[2];
  T lb[2];
  T ub[2];
  ST st[2];
  dispatch_shared_info_template<T> sh[2];
  void zero() {
    val[0] = val[1] = 0;
    status[0] = status[1] = 0;
    lb[0] = lb[1] = 0;
    ub[0] = ub[1] = 0;
    st[0] = st[1] = 0;
    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
  }
  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
                            kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
  }
  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
    sh[1 - index].u.s.iteration = 0;
  }
  kmp_int32 get_next_status(kmp_uint64 index) const {
    return status[1 - index];
  }
  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return &(sh[1 - index]);
  }
  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return &(sh[index]);
  }
};

/*
 * In the barrier implementations, num_active is the number of threads that are
 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
 * structure. tdata is the thread private data that resides on the thread's
 * private dispatch buffer structure.
 *
 * The reset_shared() method is used to initialize the barrier data on the
 * kmp_hier_top_unit_t hierarchy structure.
 *
 * The reset_private() method is used to initialize the barrier data on the
 * thread's private dispatch buffer structure.
 *
 * The barrier() method takes an id, which is that thread's id for the
 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait
 * inside barrier() until all fellow threads who are attached to that
 * kmp_hier_top_unit_t structure have arrived.
 */
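
// Illustrative sketch (not part of the runtime) of the protocol described
// above for one unit shared by `active` threads; `unit`, `tdata`, and `my_id`
// are hypothetical names (counter_barrier_impl is used the same way when more
// than 8 threads share the unit):
//
//   // one thread initializes the unit's shared barrier data
//   core_barrier_impl<kmp_uint32>::reset_shared(active, &unit->hier_barrier);
//   // every participating thread initializes its own private barrier data
//   core_barrier_impl<kmp_uint32>::reset_private(active, &tdata);
//   ...
//   // each dispatch round, all `active` threads rendezvous here
//   core_barrier_impl<kmp_uint32>::barrier(my_id, &unit->hier_barrier, &tdata);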

// Core barrier implementation
// Can be used in a unit with between 2 to 8 threads
template <typename T> class core_barrier_impl {
  static inline kmp_uint64 get_wait_val(int num_active) {
    kmp_uint64 wait_val = 0LL;
    switch (num_active) {
    case 2:
      wait_val = 0x0101LL;
      break;
    case 3:
      wait_val = 0x010101LL;
      break;
    case 4:
      wait_val = 0x01010101LL;
      break;
    case 5:
      wait_val = 0x0101010101LL;
      break;
    case 6:
      wait_val = 0x010101010101LL;
      break;
    case 7:
      wait_val = 0x01010101010101LL;
      break;
    case 8:
      wait_val = 0x0101010101010101LL;
      break;
    default:
      // don't use the core_barrier_impl for more than 8 threads
      KMP_ASSERT(0);
    }
    return wait_val;
  }

public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
template <typename T>
void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                        kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
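
// Worked example (illustrative): with 4 active threads get_wait_val(4) is
// 0x01010101. Each thread stores one byte (at offset `id`) of
// bdata->val[current_index]; once all four bytes are '\1' the 64-bit word
// equals the wait value and __kmp_wait() releases everyone. On the next use
// of the same buffer the threads store '\0' and wait for 0 instead, which is
// why next_wait_value toggles between 0 and get_wait_val().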

// Counter barrier implementation
// Can be used in a unit with an arbitrary number of active threads
template <typename T> class counter_barrier_impl {
public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
template <typename T>
void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                           kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  volatile kmp_int64 *val;
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;

  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
  KMP_TEST_THEN_INC64(val);
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
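
// Worked example (illustrative): with num_active == 4 every thread starts
// with wait_val[0] == wait_val[1] == 4. Each round a thread atomically
// increments bdata->val[current_index], waits until that counter is >= its
// private target, then raises the target for this buffer by num_active
// (4 -> 8 -> 12 -> ...) and flips to the other buffer for the next round.
// The shared counter is never reset, so a late arrival can never observe a
// stale release value.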

// Data associated with topology unit within a layer
// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
template <typename T> struct kmp_hier_top_unit_t {
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_int32 active; // number of topology units that communicate with this unit
  // chunk information (lower/upper bound, stride, etc.)
  dispatch_private_info_template<T> hier_pr;
  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit

  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
  void reset_shared_barrier() {
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    hier_barrier.zero();
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
    } else {
      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
    }
  }
  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_private(active, tdata);
    } else {
      counter_barrier_impl<T>::reset_private(active, tdata);
    }
  }
  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    KMP_DEBUG_ASSERT(id >= 0 && id < active);
    if (active == 1) {
      tdata->index = 1 - tdata->index;
      return;
    }
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    } else {
      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    }
  }
  kmp_int32 get_next_status(kmp_uint64 index) const {
    return hier_barrier.get_next_status(index);
  }
  T get_next_lb(kmp_uint64 index) const {
    return hier_barrier.get_next_lb(index);
  }
  T get_next_ub(kmp_uint64 index) const {
    return hier_barrier.get_next_ub(index);
  }
  ST get_next_st(kmp_uint64 index) const {
    return hier_barrier.get_next_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return hier_barrier.get_next_sh(index);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const {
    return hier_barrier.get_curr_status(index);
  }
  T get_curr_lb(kmp_uint64 index) const {
    return hier_barrier.get_curr_lb(index);
  }
  T get_curr_ub(kmp_uint64 index) const {
    return hier_barrier.get_curr_ub(index);
  }
  ST get_curr_st(kmp_uint64 index) const {
    return hier_barrier.get_curr_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return hier_barrier.get_curr_sh(index);
  }

  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
                            kmp_uint64 index) {
    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
  }
  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
    hier_barrier.set_next(lb, ub, st, status, index);
  }
  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
  dispatch_private_info_template<T> *get_parent_pr() {
    return &(hier_parent->hier_pr);
  }

  kmp_int32 is_active() const { return active; }
  kmp_int32 get_num_active() const { return active; }
  void print() {
    KD_TRACE(
        10,
        ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
         hier_pr.u.p.tc));
  }
};

// Information regarding a single layer within the scheduling hierarchy
template <typename T> struct kmp_hier_layer_info_t {
  int num_active; // number of threads active in this level
  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
  enum sched_type sched; // static, dynamic, guided, etc.
  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
  int length; // length of the kmp_hier_top_unit_t array

  // Print this layer's information
  void print() {
    const char *t = __kmp_get_hier_str(type);
    KD_TRACE(
        10,
        ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
         "length:%d\n",
         num_active, t, sched, chunk, length));
  }
};

/*
 * Structure to implement entire hierarchy
 *
 * The hierarchy is kept as an array of arrays to represent the different
 * layers. Layer 0 is the lowest layer and layer num_layers - 1 is the
 * highest layer. For example, a hierarchy with two L3s, four L2s, and
 * eight L1s would be stored as:
 * [ 2 ] -> [ L3 | L3 ]
 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
 * There is also an array of layer_info_t which has information regarding
 * each layer.
 */
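// For the example hierarchy above (eight L1 units, four L2 units, two L3
// units) an initialized kmp_hier_t<T> would report, illustratively:
//   get_num_layers() == 3
//   get_length(0) == 8, get_length(1) == 4, get_length(2) == 2
//   get_unit(1, 2) is the third L2 unit and, assuming two L2 units share each
//   L3, its get_parent() points at the second L3 unit.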
template <typename T> struct kmp_hier_t {
public:
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

private:
  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
                   kmp_int32 previous_id, int hier_level) {
    int status;
    kmp_info_t *th = __kmp_threads[gtid];
    auto parent = current->get_parent();
    bool last_layer = (hier_level == get_num_layers() - 1);
    KMP_DEBUG_ASSERT(th);
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
    KMP_DEBUG_ASSERT(current);
    KMP_DEBUG_ASSERT(hier_level >= 0);
    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent || last_layer);

    KD_TRACE(
        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));

    T hier_id = (T)current->get_hier_id();
    // Attempt to grab next iteration range for this level
    if (previous_id == 0) {
      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
                   gtid, hier_level));
      kmp_int32 contains_last;
      T my_lb, my_ub;
      ST my_st;
      T nproc;
      dispatch_shared_info_template<T> volatile *my_sh;
      dispatch_private_info_template<T> *my_pr;
      if (last_layer) {
        // last layer below the very top uses the single shared buffer
        // from the team struct.
        KD_TRACE(10,
                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
        nproc = (T)get_top_level_nproc();
      } else {
        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
        // structure
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                      gtid, hier_level));
        my_sh =
            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
        nproc = (T)parent->get_num_active();
      }
      my_pr = current->get_my_pr();
      KMP_DEBUG_ASSERT(my_sh);
      KMP_DEBUG_ASSERT(my_pr);
      enum sched_type schedule = get_sched(hier_level);
      ST chunk = (ST)get_chunk(hier_level);
      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                                &contains_last, &my_lb, &my_ub,
                                                &my_st, nproc, hier_id);
      KD_TRACE(
          10,
          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
           gtid, hier_level, status));
      // When no iterations are found (status == 0) and this is not the last
      // layer, attempt to go up the hierarchy for more iterations
      if (status == 0 && !last_layer) {
        kmp_int32 hid;
        __kmp_type_convert(hier_id, &hid);
        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                              &my_st, hid, hier_level + 1);
        KD_TRACE(
            10,
            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
             gtid, hier_level, status));
        if (status == 1) {
          kmp_hier_private_bdata_t *upper_tdata =
              &(th->th.th_hier_bar_data[hier_level + 1]);
          my_sh = parent->get_curr_sh(upper_tdata->index);
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                        gtid, hier_level));
          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                        parent->get_curr_lb(upper_tdata->index),
                                        parent->get_curr_ub(upper_tdata->index),
                                        parent->get_curr_st(upper_tdata->index),
#if USE_ITT_BUILD
                                        NULL,
#endif
                                        chunk, nproc, hier_id);
          status = __kmp_dispatch_next_algorithm<T>(
              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
              hier_id);
          if (!status) {
            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                          "setting to 2!\n",
                          gtid, hier_level));
            status = 2;
          }
        }
      }
      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
      // Propagate whether a unit holds the actual global last iteration
      // The contains_last attribute is sent downwards from the top to the
      // bottom of the hierarchy via the contains_last flag inside the
      // private dispatch buffers in the hierarchy's middle layers
      if (contains_last) {
        // If the next_algorithm() method returns 1 for p_last and it is the
        // last layer or our parent contains the last serial chunk, then the
        // chunk must contain the last serial iteration.
        if (last_layer || parent->hier_pr.flags.contains_last) {
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                        "to contain last.\n",
                        gtid, hier_level));
          current->hier_pr.flags.contains_last = contains_last;
        }
        if (!current->hier_pr.flags.contains_last)
          contains_last = FALSE;
      }
      *p_last = contains_last;
    } // if primary thread of this unit
    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
                gtid, hier_level));
      current->barrier(previous_id, tdata);
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
                gtid, hier_level, current->get_curr_status(tdata->index)));
    } else {
      KMP_DEBUG_ASSERT(previous_id == 0);
      return status;
    }
    return current->get_curr_status(tdata->index);
  }

public:
  // Hierarchy bookkeeping: per-layer info plus the array of arrays of
  // topology units described above
  int top_level_nproc;
  int num_layers;
  bool valid;
  int type_size;
  kmp_hier_layer_info_t<T> *info;
  kmp_hier_top_unit_t<T> **layers;
  // Deallocate all memory from this hierarchy
  void deallocate() {
    for (int i = 0; i < num_layers; ++i)
      if (layers[i] != NULL) {
        __kmp_free(layers[i]);
      }
    if (layers != NULL) {
      __kmp_free(layers);
      layers = NULL;
    }
    if (info != NULL) {
      __kmp_free(info);
      info = NULL;
    }
    num_layers = 0;
    valid = false;
  }
  // Returns true if reallocation is needed else false
  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                          const enum sched_type *new_scheds,
                          const ST *new_chunks) const {
    if (!valid || layers == NULL || info == NULL ||
        traits_t<T>::type_size != type_size || n != num_layers)
      return true;
    for (int i = 0; i < n; ++i) {
      if (info[i].type != new_layers[i])
        return true;
      if (info[i].sched != new_scheds[i])
        return true;
      if (info[i].chunk != new_chunks[i])
        return true;
    }
    return false;
  }
  // A single thread should call this function while the other threads wait
  // create a new scheduling hierarchy consisting of new_layers, new_scheds
  // and new_chunks. These should come pre-sorted according to
  // kmp_hier_layer_e value. This function will try to avoid reallocation
  // if the previously built hierarchy can be reused.
  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                     const enum sched_type *new_scheds, const ST *new_chunks) {
    int max = 0;
    this->valid = false;
    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
      KD_TRACE(
          10,
          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
      for (int i = 0; i < n; ++i) {
        info[i].num_active = 0;
        for (int j = 0; j < get_length(i); ++j)
          layers[i][j].active = 0;
      }
      this->valid = true;
      return;
    }
    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
    deallocate();
    type_size = traits_t<T>::type_size;
    num_layers = n;
    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
        sizeof(kmp_hier_layer_info_t<T>) * n);
    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
        sizeof(kmp_hier_top_unit_t<T> *) * n);
    for (int i = 0; i < n; ++i) {
      kmp_hier_layer_e layer = new_layers[i];
      info[i].num_active = 0;
      info[i].type = layer;
      info[i].sched = new_scheds[i];
      info[i].chunk = new_chunks[i];
      max = __kmp_hier_max_units[layer + 1];
      if (max == 0) {
        // This layer was not detected on the machine; the hierarchy is invalid
        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
        deallocate();
        return;
      }
      info[i].length = max;
      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
          sizeof(kmp_hier_top_unit_t<T>) * max);
      for (int j = 0; j < max; ++j) {
        layers[i][j].active = 0;
        layers[i][j].hier_pr.flags.use_hier = TRUE;
      }
    }
    this->valid = true;
  }
  // loc - source file location
  // gtid - global thread identifier
  // pr - this thread's private dispatch buffer (corresponding with gtid)
  // p_last (return value) - pointer to flag indicating this set of iterations
  //                         contains the last serial iteration
  // p_lb (return value) - lower bound for this chunk of iterations
  // p_ub (return value) - upper bound for this chunk of iterations
  // p_st (return value) - stride for this chunk of iterations
  //
  // Returns 1 if there are more iterations to perform, 0 otherwise
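  //
  // Illustrative driver loop (not part of the runtime); every name other than
  // next() below is hypothetical, and the chunk bounds are assumed inclusive
  // with a positive stride:
  //   T lb, ub;
  //   ST st;
  //   kmp_int32 last;
  //   while (hier->next(loc, gtid, pr, &last, &lb, &ub, &st)) {
  //     for (T i = lb; i <= ub; i += st)
  //       body(i);
  //   }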
  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
    int status;
    kmp_int32 contains_last = 0;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
    auto parent = pr->get_parent();
    KMP_DEBUG_ASSERT(parent);
    KMP_DEBUG_ASSERT(th);
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent);
    T nproc = (T)parent->get_num_active();
    T unit_id = (T)pr->get_hier_id();
    KD_TRACE(10,
             ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
              gtid, nproc, unit_id));
    // Handthreading implementation
    // Each iteration is performed by all threads on last unit (typically
    // cores)
    // e.g., threads 0,1,2,3 all execute iteration 0
    //       threads 0,1,2,3 all execute iteration 1
    //       threads 4,5,6,7 all execute iteration 2
    //       threads 4,5,6,7 all execute iteration 3
    //       etc.
    if (__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
                gtid));
      if (unit_id == 0) {
        // For hand threading, the sh buffer on the lowest level is only ever
        // modified and read by the primary thread on that level. Because of
        // this, we can always use the first sh buffer.
        auto sh = &(parent->hier_barrier.sh[0]);
        KMP_DEBUG_ASSERT(sh);
        status = __kmp_dispatch_next_algorithm<T>(
            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
        if (!status) {
          bool done = false;
          while (!done) {
            done = true;
            kmp_int32 uid;
            __kmp_type_convert(unit_id, &uid);
            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                  p_st, uid, 0);
            if (status == 1) {
              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                            parent->get_next_lb(tdata->index),
                                            parent->get_next_ub(tdata->index),
                                            parent->get_next_st(tdata->index),
#if USE_ITT_BUILD
                                            NULL,
#endif
                                            pr->u.p.parm1, nproc, unit_id);
              sh->u.s.iteration = 0;
              status = __kmp_dispatch_next_algorithm<T>(
                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
                  unit_id);
              if (!status) {
                KD_TRACE(10,
                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                          "after next_pr_sh() trying again.\n",
                          gtid));
                done = false;
              }
            } else if (status == 2) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                            "trying again.\n",
                            gtid));
              done = false;
            }
          }
        }
        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
      } // if primary thread of lowest unit level
      parent->barrier(pr->get_hier_id(), tdata);
      if (unit_id != 0) {
        *p_lb = parent->get_curr_lb(tdata->index);
        *p_ub = parent->get_curr_ub(tdata->index);
        *p_st = parent->get_curr_st(tdata->index);
        status = parent->get_curr_status(tdata->index);
      }
    } else {
      // Normal implementation
      // Each thread grabs an iteration chunk and executes it (no cooperation)
      auto sh = parent->get_curr_sh(tdata->index);
      KMP_DEBUG_ASSERT(sh);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
      if (!status) {
        bool done = false;
        while (!done) {
          done = true;
          kmp_int32 uid;
          __kmp_type_convert(unit_id, &uid);
          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                p_st, uid, 0);
          if (status == 1) {
            sh = parent->get_curr_sh(tdata->index);
            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                          parent->get_curr_lb(tdata->index),
                                          parent->get_curr_ub(tdata->index),
                                          parent->get_curr_st(tdata->index),
#if USE_ITT_BUILD
                                          NULL,
#endif
                                          pr->u.p.parm1, nproc, unit_id);
            status = __kmp_dispatch_next_algorithm<T>(
                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
            if (!status) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                            "after next_pr_sh() trying again.\n",
                            gtid));
              done = false;
            }
          } else if (status == 2) {
            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                          "trying again.\n",
                          gtid));
            done = false;
          }
        }
      }
    }
    if (contains_last && !parent->hier_pr.flags.contains_last) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                    "contains_last to FALSE\n",
                    gtid));
      contains_last = FALSE;
    }
    if (p_last)
      *p_last = contains_last;
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
                  status));
    return status;
  }
  // These functions probe the layer info structure
  // Returns the type of topology unit given level
  kmp_hier_layer_e get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].type;
  }
  // Returns the schedule type at given level
  enum sched_type get_sched(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].sched;
  }
  // Returns the chunk size at given level
  ST get_chunk(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].chunk;
  }
  // Returns the number of active threads at given level
  int get_num_active(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].num_active;
  }
  // Returns the length of topology unit array at given level
  int get_length(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].length;
  }
  // Returns the topology unit given the level and index
  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    KMP_DEBUG_ASSERT(index >= 0);
    KMP_DEBUG_ASSERT(index < get_length(level));
    return &(layers[level][index]);
  }
  // Returns the number of layers in the hierarchy
  int get_num_layers() const { return num_layers; }
  // Returns the number of threads in the top layer
  // This is necessary because we don't store a topology unit as
  // the very top level and the scheduling algorithms need this information
  int get_top_level_nproc() const { return top_level_nproc; }
  // Return whether this hierarchy is valid or not
  bool is_valid() const { return valid; }
  // Print the hierarchy
  void print() {
    KD_TRACE(10, ("kmp_hier_t:\n"));
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Info[%d] = ", i));
      info[i].print();
    }
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Layer[%d] =\n", i));
      for (int j = 0; j < info[i].length; ++j) {
        layers[i][j].print();
      }
    }
  }
};
>
920 void __kmp_dispatch_init_hierarchy(ident_t
*loc
, int n
,
921 kmp_hier_layer_e
*new_layers
,
922 enum sched_type
*new_scheds
,
923 typename traits_t
<T
>::signed_t
*new_chunks
,
925 typename traits_t
<T
>::signed_t st
) {
926 int tid
, gtid
, num_hw_threads
, num_threads_per_layer1
, active
;
927 unsigned int my_buffer_index
;
930 dispatch_private_info_template
<T
> *pr
;
931 dispatch_shared_info_template
<T
> volatile *sh
;
932 gtid
= __kmp_entry_gtid();
933 tid
= __kmp_tid_from_gtid(gtid
);
935 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
937 for (int i
= 0; i
< n
; ++i
) {
938 const char *layer
= __kmp_get_hier_str(new_layers
[i
]);
939 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
940 "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
941 gtid
, i
, layer
, i
, (int)new_scheds
[i
], i
, new_chunks
[i
]));
  KMP_DEBUG_ASSERT(n > 0);
  KMP_DEBUG_ASSERT(new_layers);
  KMP_DEBUG_ASSERT(new_scheds);
  KMP_DEBUG_ASSERT(new_chunks);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    KMP_DEBUG_ASSERT(pr);
    pr->flags.use_hier = FALSE;
    pr->flags.contains_last = FALSE;
    return;
  }
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  pr->flags.use_hier = TRUE;
  // Have primary thread allocate the hierarchy
  if (__kmp_tid_from_gtid(gtid) == 0) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                  "hierarchy\n",
                  gtid, pr, sh));
    if (sh->hier == NULL) {
      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
    }
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    sh->u.s.iteration = 0;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  // Check to make sure the hierarchy is valid
  kmp_hier_t<T> *hier = sh->hier;
  if (!sh->hier->is_valid()) {
    pr->flags.use_hier = FALSE;
    return;
  }
  // Have threads allocate their thread-private barrier data if it hasn't
  // already been allocated
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Have threads "register" themselves by modifying the active count for each
  // level they are involved in. The active count will act as nthreads for that
  // level regarding the scheduling algorithms
  for (int i = 0; i < n; ++i) {
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Setup the thread's private dispatch buffer's hierarchy pointers
    if (i == 0)
      pr->hier_parent = my_unit;
    // If this unit is already active, then increment active count and wait
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
    // Flag that this unit is active
    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
      // Do not setup parent pointer for top level unit since it has no parent
      if (i < n - 1) {
        // Setup middle layer pointers to parents
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 hier->get_type(i + 1));
        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
      } else {
        // Setup top layer information (no parent pointers are set)
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 kmp_hier_layer_e::LAYER_LOOP);
        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
        my_unit->hier_parent = nullptr;
      }
      // Set trip count to 0 so that next() operation will initially climb up
      // the hierarchy to get more iterations (early exit in next() for tc == 0)
      my_unit->get_my_pr()->u.p.tc = 0;
      // Increment this layer's number of active units
      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "incrementing num_active\n",
                    gtid, my_unit));
    } else {
      // Another thread already claimed this unit; just register with it
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
  }
  // Set this thread's id
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // For oversubscribed threads, increment their index within the lowest unit
  // This is done to prevent having two or more threads with id 0, id 1, etc.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));
  pr->flags.contains_last = FALSE;
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Now that the number of active threads at each level is determined,
  // the barrier data for each unit can be initialized and the last layer's
  // loop information can be initialized.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Only primary threads of this unit within the hierarchy do initialization
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    my_unit->reset_shared_barrier();
    my_unit->hier_pr.flags.contains_last = FALSE;
    // Last layer, initialize the private buffers with entire loop information
    // Now the next next_algorithm() call will get the first chunk of
    // iterations properly
    if (i == n - 1) {
      __kmp_dispatch_init_algorithm<T>(
          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
#if USE_ITT_BUILD
          NULL,
#endif
          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    }
    prev_id = my_unit->get_hier_id();
  }
  // Initialize each layer of the thread's private barrier data
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  if (__kmp_tid_from_gtid(gtid) == 0) {
    for (int i = 0; i < n; ++i) {
      KD_TRACE(10,
               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
                gtid, i, hier->get_num_active(i)));
    }
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}

#endif // KMP_DISPATCH_HIER_H