2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 /* Dynamic scheduling initialization and dispatch.
15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16 * it may change values between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take, 1 is the smallest.
21 #include "kmp_error.h"
24 #include "kmp_stats.h"
26 #if KMP_USE_X87CONTROL
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
36 #include "ompt-specific.h"
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
42 void __kmp_dispatch_deo_error(int *gtid_ref
, int *cid_ref
, ident_t
*loc_ref
) {
45 KMP_DEBUG_ASSERT(gtid_ref
);
47 if (__kmp_env_consistency_check
) {
48 th
= __kmp_threads
[*gtid_ref
];
49 if (th
->th
.th_root
->r
.r_active
&&
50 (th
->th
.th_dispatch
->th_dispatch_pr_current
->pushed_ws
!= ct_none
)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref
, ct_ordered_in_pdo
, loc_ref
, NULL
, 0);
54 __kmp_push_sync(*gtid_ref
, ct_ordered_in_pdo
, loc_ref
, NULL
);
60 void __kmp_dispatch_dxo_error(int *gtid_ref
, int *cid_ref
, ident_t
*loc_ref
) {
63 if (__kmp_env_consistency_check
) {
64 th
= __kmp_threads
[*gtid_ref
];
65 if (th
->th
.th_dispatch
->th_dispatch_pr_current
->pushed_ws
!= ct_none
) {
66 __kmp_pop_sync(*gtid_ref
, ct_ordered_in_pdo
, loc_ref
);
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule
,
73 bool use_hier
= false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
76 // default to monotonic
77 monotonicity
= SCHEDULE_MONOTONIC
;
78 if (SCHEDULE_HAS_NONMONOTONIC(schedule
))
79 monotonicity
= SCHEDULE_NONMONOTONIC
;
80 else if (SCHEDULE_HAS_MONOTONIC(schedule
))
81 monotonicity
= SCHEDULE_MONOTONIC
;
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule,chunk. The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride). nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used). tid is the id of the thread calling
90 // the function within the group of nproc threads. It will have a value
91 // between 0 and nproc - 1. This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
96 void __kmp_dispatch_init_algorithm(ident_t
*loc
, int gtid
,
97 dispatch_private_info_template
<T
> *pr
,
98 enum sched_type schedule
, T lb
, T ub
,
99 typename traits_t
<T
>::signed_t st
,
101 kmp_uint64
*cur_chunk
,
103 typename traits_t
<T
>::signed_t chunk
,
105 typedef typename traits_t
<T
>::unsigned_t UT
;
106 typedef typename traits_t
<T
>::floating_t DBL
;
116 typedef typename traits_t
<T
>::signed_t ST
;
119 // create format specifiers before the debug output
120 buff
= __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123 traits_t
<T
>::spec
, traits_t
<T
>::spec
,
124 traits_t
<ST
>::spec
, traits_t
<ST
>::spec
,
125 traits_t
<T
>::spec
, traits_t
<T
>::spec
);
126 KD_TRACE(10, (buff
, gtid
, pr
, lb
, ub
, st
, schedule
, chunk
, nproc
, tid
));
127 __kmp_str_free(&buff
);
131 th
= __kmp_threads
[gtid
];
132 team
= th
->th
.th_team
;
133 active
= !team
->t
.t_serialized
;
136 int itt_need_metadata_reporting
=
137 __itt_metadata_add_ptr
&& __kmp_forkjoin_frames_mode
== 3 &&
138 KMP_MASTER_GTID(gtid
) && th
->th
.th_teams_microtask
== NULL
&&
139 team
->t
.t_active_level
== 1;
142 #if KMP_USE_HIER_SCHED
143 use_hier
= pr
->flags
.use_hier
;
148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149 monotonicity
= __kmp_get_monotonicity(schedule
, use_hier
);
150 schedule
= SCHEDULE_WITHOUT_MODIFIERS(schedule
);
152 /* Pick up the nomerge/ordered bits from the scheduling type */
153 if ((schedule
>= kmp_nm_lower
) && (schedule
< kmp_nm_upper
)) {
154 pr
->flags
.nomerge
= TRUE
;
156 (enum sched_type
)(((int)schedule
) - (kmp_nm_lower
- kmp_sch_lower
));
158 pr
->flags
.nomerge
= FALSE
;
160 pr
->type_size
= traits_t
<T
>::type_size
; // remember the size of variables
161 if (kmp_ord_lower
& schedule
) {
162 pr
->flags
.ordered
= TRUE
;
164 (enum sched_type
)(((int)schedule
) - (kmp_ord_lower
- kmp_sch_lower
));
166 pr
->flags
.ordered
= FALSE
;
168 // Ordered overrides nonmonotonic
169 if (pr
->flags
.ordered
) {
170 monotonicity
= SCHEDULE_MONOTONIC
;
173 if (schedule
== kmp_sch_static
) {
174 schedule
= __kmp_static
;
176 if (schedule
== kmp_sch_runtime
) {
177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
179 schedule
= team
->t
.t_sched
.r_sched_type
;
180 monotonicity
= __kmp_get_monotonicity(schedule
, use_hier
);
181 schedule
= SCHEDULE_WITHOUT_MODIFIERS(schedule
);
182 // Detail the schedule if needed (global controls are differentiated
184 if (schedule
== kmp_sch_guided_chunked
) {
185 schedule
= __kmp_guided
;
186 } else if (schedule
== kmp_sch_static
) {
187 schedule
= __kmp_static
;
189 // Use the chunk size specified by OMP_SCHEDULE (or default if not
191 chunk
= team
->t
.t_sched
.chunk
;
199 // create format specifiers before the debug output
200 buff
= __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201 "schedule:%%d chunk:%%%s\n",
203 KD_TRACE(10, (buff
, gtid
, schedule
, chunk
));
204 __kmp_str_free(&buff
);
208 if (schedule
== kmp_sch_guided_chunked
) {
209 schedule
= __kmp_guided
;
212 chunk
= KMP_DEFAULT_CHUNK
;
216 if (schedule
== kmp_sch_auto
) {
217 // mapping and differentiation: in the __kmp_do_serial_initialize()
218 schedule
= __kmp_auto
;
222 // create format specifiers before the debug output
223 buff
= __kmp_str_format(
224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225 "schedule:%%d chunk:%%%s\n",
227 KD_TRACE(10, (buff
, gtid
, schedule
, chunk
));
228 __kmp_str_free(&buff
);
232 #if KMP_STATIC_STEAL_ENABLED
233 // map nonmonotonic:dynamic to static steal
234 if (schedule
== kmp_sch_dynamic_chunked
) {
235 if (monotonicity
== SCHEDULE_NONMONOTONIC
)
236 schedule
= kmp_sch_static_steal
;
239 /* guided analytical not safe for too many threads */
240 if (schedule
== kmp_sch_guided_analytical_chunked
&& nproc
> 1 << 20) {
241 schedule
= kmp_sch_guided_iterative_chunked
;
242 KMP_WARNING(DispatchManyThreads
);
244 if (schedule
== kmp_sch_runtime_simd
) {
245 // compiler provides simd_width in the chunk parameter
246 schedule
= team
->t
.t_sched
.r_sched_type
;
247 monotonicity
= __kmp_get_monotonicity(schedule
, use_hier
);
248 schedule
= SCHEDULE_WITHOUT_MODIFIERS(schedule
);
249 // Detail the schedule if needed (global controls are differentiated
251 if (schedule
== kmp_sch_static
|| schedule
== kmp_sch_auto
||
252 schedule
== __kmp_static
) {
253 schedule
= kmp_sch_static_balanced_chunked
;
255 if (schedule
== kmp_sch_guided_chunked
|| schedule
== __kmp_guided
) {
256 schedule
= kmp_sch_guided_simd
;
258 chunk
= team
->t
.t_sched
.chunk
* chunk
;
267 // create format specifiers before the debug output
268 buff
= __kmp_str_format(
269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
272 KD_TRACE(10, (buff
, gtid
, schedule
, chunk
));
273 __kmp_str_free(&buff
);
277 pr
->u
.p
.parm1
= chunk
;
279 KMP_ASSERT2((kmp_sch_lower
< schedule
&& schedule
< kmp_sch_upper
),
280 "unknown scheduling type");
284 if (__kmp_env_consistency_check
) {
286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited
,
287 (pr
->flags
.ordered
? ct_pdo_ordered
: ct_pdo
), loc
);
290 // compute trip count
291 if (st
== 1) { // most common case
299 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300 // where the division needs to be unsigned regardless of the result type
301 tc
= (UT
)(lb
- ub
) / (-st
) + 1;
307 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308 // where the division needs to be unsigned regardless of the result type
309 tc
= (UT
)(ub
- lb
) / st
+ 1;
315 #if KMP_STATS_ENABLED
316 if (KMP_MASTER_GTID(gtid
)) {
317 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations
, tc
);
327 pr
->u
.p
.last_upper
= ub
+ st
;
328 #endif /* KMP_OS_WINDOWS */
330 /* NOTE: only the active parallel region(s) has active ordered sections */
333 if (pr
->flags
.ordered
) {
334 pr
->ordered_bumped
= 0;
335 pr
->u
.p
.ordered_lower
= 1;
336 pr
->u
.p
.ordered_upper
= 0;
341 #if (KMP_STATIC_STEAL_ENABLED)
342 case kmp_sch_static_steal
: {
346 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
349 ntc
= (tc
% chunk
? 1 : 0) + tc
/ chunk
;
350 if (nproc
> 1 && ntc
>= nproc
) {
351 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL
);
353 T small_chunk
, extras
;
355 small_chunk
= ntc
/ nproc
;
356 extras
= ntc
% nproc
;
358 init
= id
* small_chunk
+ (id
< extras
? id
: extras
);
359 pr
->u
.p
.count
= init
;
360 pr
->u
.p
.ub
= init
+ small_chunk
+ (id
< extras
? 1 : 0);
363 // parm3 is the number of times to attempt stealing which is
364 // proportional to the number of chunks per thread up until
365 // the maximum value of nproc.
366 pr
->u
.p
.parm3
= KMP_MIN(small_chunk
+ extras
, nproc
);
367 pr
->u
.p
.parm4
= (id
+ 1) % nproc
; // remember neighbour tid
369 if (traits_t
<T
>::type_size
> 4) {
370 // AC: TODO: check if 16-byte CAS available and use it to
371 // improve performance (probably wait for explicit request
372 // before spending time on this).
373 // For now use dynamically allocated per-thread lock,
374 // free memory in __kmp_dispatch_next when status==0.
375 KMP_DEBUG_ASSERT(pr
->u
.p
.th_steal_lock
== NULL
);
376 pr
->u
.p
.th_steal_lock
=
377 (kmp_lock_t
*)__kmp_allocate(sizeof(kmp_lock_t
));
378 __kmp_init_lock(pr
->u
.p
.th_steal_lock
);
382 /* too few chunks: switching to kmp_sch_dynamic_chunked */
383 schedule
= kmp_sch_dynamic_chunked
;
384 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385 "kmp_sch_dynamic_chunked\n",
387 if (pr
->u
.p
.parm1
<= 0)
388 pr
->u
.p
.parm1
= KMP_DEFAULT_CHUNK
;
393 case kmp_sch_static_balanced
: {
398 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
408 pr
->u
.p
.parm1
= (id
== tc
- 1); /* parm1 stores *plastiter */
410 pr
->u
.p
.count
= 1; /* means no more chunks to execute */
411 pr
->u
.p
.parm1
= FALSE
;
415 T small_chunk
= tc
/ nproc
;
416 T extras
= tc
% nproc
;
417 init
= id
* small_chunk
+ (id
< extras
? id
: extras
);
418 limit
= init
+ small_chunk
- (id
< extras
? 0 : 1);
419 pr
->u
.p
.parm1
= (id
== nproc
- 1);
425 pr
->u
.p
.parm1
= TRUE
;
428 pr
->u
.p
.count
= 1; /* means no more chunks to execute */
429 pr
->u
.p
.parm1
= FALSE
;
434 // Calculate chunk for metadata report
435 if (itt_need_metadata_reporting
)
437 *cur_chunk
= limit
- init
+ 1;
440 pr
->u
.p
.lb
= lb
+ init
;
441 pr
->u
.p
.ub
= lb
+ limit
;
443 // calculated upper bound, "ub" is user-defined upper bound
444 T ub_tmp
= lb
+ limit
* st
;
445 pr
->u
.p
.lb
= lb
+ init
* st
;
446 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
449 pr
->u
.p
.ub
= (ub_tmp
+ st
> ub
? ub
: ub_tmp
);
451 pr
->u
.p
.ub
= (ub_tmp
+ st
< ub
? ub
: ub_tmp
);
454 if (pr
->flags
.ordered
) {
455 pr
->u
.p
.ordered_lower
= init
;
456 pr
->u
.p
.ordered_upper
= limit
;
460 case kmp_sch_static_balanced_chunked
: {
461 // similar to balanced, but chunk adjusted to multiple of simd width
463 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464 " -> falling-through to static_greedy\n",
466 schedule
= kmp_sch_static_greedy
;
468 pr
->u
.p
.parm1
= ((tc
+ nth
- 1) / nth
+ chunk
- 1) & ~(chunk
- 1);
473 case kmp_sch_guided_simd
:
474 case kmp_sch_guided_iterative_chunked
: {
477 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
482 if ((2L * chunk
+ 1) * nproc
>= tc
) {
483 /* chunk size too large, switch to dynamic */
484 schedule
= kmp_sch_dynamic_chunked
;
486 // when remaining iters become less than parm2 - switch to dynamic
487 pr
->u
.p
.parm2
= guided_int_param
* nproc
* (chunk
+ 1);
488 *(double *)&pr
->u
.p
.parm3
=
489 guided_flt_param
/ nproc
; // may occupy parm3 and parm4
492 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493 "kmp_sch_static_greedy\n",
495 schedule
= kmp_sch_static_greedy
;
496 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
499 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
505 case kmp_sch_guided_analytical_chunked
: {
506 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507 "kmp_sch_guided_analytical_chunked case\n",
511 if ((2L * chunk
+ 1) * nproc
>= tc
) {
512 /* chunk size too large, switch to dynamic */
513 schedule
= kmp_sch_dynamic_chunked
;
515 /* commonly used term: (2 nproc - 1)/(2 nproc) */
518 #if KMP_USE_X87CONTROL
519 /* Linux* OS already has 64-bit computation by default for long double,
520 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522 instead of the default 53-bit. Even though long double doesn't work
523 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524 expected to impact the correctness of the algorithm, but this has not
525 been mathematically proven. */
526 // save original FPCW and set precision to 64-bit, as
527 // Windows* OS on IA-32 architecture defaults to 53-bit
528 unsigned int oldFpcw
= _control87(0, 0);
529 _control87(_PC_64
, _MCW_PC
); // 0,0x30000
531 /* value used for comparison in solver for cross-over point */
532 long double target
= ((long double)chunk
* 2 + 1) * nproc
/ tc
;
534 /* crossover point--chunk indexes equal to or greater than
535 this point switch to dynamic-style scheduling */
538 /* commonly used term: (2 nproc - 1)/(2 nproc) */
539 x
= (long double)1.0 - (long double)0.5 / nproc
;
542 { // test natural alignment
550 ptrdiff_t natural_alignment
=
551 (ptrdiff_t)&t
.b
- (ptrdiff_t)&t
- (ptrdiff_t)1;
552 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553 // long)natural_alignment );
555 (((ptrdiff_t)&pr
->u
.p
.parm3
) & (natural_alignment
)) == 0);
559 /* save the term in thread private dispatch structure */
560 *(DBL
*)&pr
->u
.p
.parm3
= x
;
562 /* solve for the crossover point to the nearest integer i for which C_i
568 /* estimate initial upper and lower bound */
570 /* doesn't matter what value right is as long as it is positive, but
571 it affects performance of the solver */
573 p
= __kmp_pow
<UT
>(x
, right
);
578 } while (p
> target
&& right
< (1 << 27));
579 /* lower bound is previous (failed) estimate of upper bound */
585 /* bisection root-finding method */
586 while (left
+ 1 < right
) {
587 mid
= (left
+ right
) / 2;
588 if (__kmp_pow
<UT
>(x
, mid
) > target
) {
596 /* assert sanity of computed crossover point */
597 KMP_ASSERT(cross
&& __kmp_pow
<UT
>(x
, cross
- 1) > target
&&
598 __kmp_pow
<UT
>(x
, cross
) <= target
);
600 /* save the crossover point in thread private dispatch structure */
601 pr
->u
.p
.parm2
= cross
;
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
609 /* dynamic-style scheduling offset */
610 pr
->u
.p
.count
= tc
- __kmp_dispatch_guided_remaining(
611 tc
, GUIDED_ANALYTICAL_WORKAROUND
, cross
) -
613 #if KMP_USE_X87CONTROL
615 _control87(oldFpcw
, _MCW_PC
);
619 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620 "kmp_sch_static_greedy\n",
622 schedule
= kmp_sch_static_greedy
;
623 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
628 case kmp_sch_static_greedy
:
631 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
633 pr
->u
.p
.parm1
= (nproc
> 1) ? (tc
+ nproc
- 1) / nproc
: tc
;
635 case kmp_sch_static_chunked
:
636 case kmp_sch_dynamic_chunked
:
637 if (pr
->u
.p
.parm1
<= 0) {
638 pr
->u
.p
.parm1
= KMP_DEFAULT_CHUNK
;
640 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
644 case kmp_sch_trapezoidal
: {
645 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
647 T parm1
, parm2
, parm3
, parm4
;
649 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
654 /* F : size of the first cycle */
655 parm2
= (tc
/ (2 * nproc
));
661 /* L : size of the last cycle. Make sure the last cycle is not larger
662 than the first cycle. */
665 } else if (parm1
> parm2
) {
669 /* N : number of cycles */
670 parm3
= (parm2
+ parm1
);
671 parm3
= (2 * tc
+ parm3
- 1) / parm3
;
677 /* sigma : decreasing incr of the trapezoid */
679 parm4
= (parm2
- parm1
) / parm4
;
681 // pointless check, because parm4 >= 0 always
682 // if ( parm4 < 0 ) {
686 pr
->u
.p
.parm1
= parm1
;
687 pr
->u
.p
.parm2
= parm2
;
688 pr
->u
.p
.parm3
= parm3
;
689 pr
->u
.p
.parm4
= parm4
;
694 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected
), // Primary message
695 KMP_HNT(GetNewerLibrary
), // Hint
696 __kmp_msg_null
// Variadic argument list terminator
700 pr
->schedule
= schedule
;
703 #if KMP_USE_HIER_SCHED
704 template <typename T
>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t
*loc
, T lb
, T ub
,
706 typename traits_t
<T
>::signed_t st
);
709 __kmp_dispatch_init_hier_runtime
<kmp_int32
>(ident_t
*loc
, kmp_int32 lb
,
710 kmp_int32 ub
, kmp_int32 st
) {
711 __kmp_dispatch_init_hierarchy
<kmp_int32
>(
712 loc
, __kmp_hier_scheds
.size
, __kmp_hier_scheds
.layers
,
713 __kmp_hier_scheds
.scheds
, __kmp_hier_scheds
.small_chunks
, lb
, ub
, st
);
717 __kmp_dispatch_init_hier_runtime
<kmp_uint32
>(ident_t
*loc
, kmp_uint32 lb
,
718 kmp_uint32 ub
, kmp_int32 st
) {
719 __kmp_dispatch_init_hierarchy
<kmp_uint32
>(
720 loc
, __kmp_hier_scheds
.size
, __kmp_hier_scheds
.layers
,
721 __kmp_hier_scheds
.scheds
, __kmp_hier_scheds
.small_chunks
, lb
, ub
, st
);
725 __kmp_dispatch_init_hier_runtime
<kmp_int64
>(ident_t
*loc
, kmp_int64 lb
,
726 kmp_int64 ub
, kmp_int64 st
) {
727 __kmp_dispatch_init_hierarchy
<kmp_int64
>(
728 loc
, __kmp_hier_scheds
.size
, __kmp_hier_scheds
.layers
,
729 __kmp_hier_scheds
.scheds
, __kmp_hier_scheds
.large_chunks
, lb
, ub
, st
);
733 __kmp_dispatch_init_hier_runtime
<kmp_uint64
>(ident_t
*loc
, kmp_uint64 lb
,
734 kmp_uint64 ub
, kmp_int64 st
) {
735 __kmp_dispatch_init_hierarchy
<kmp_uint64
>(
736 loc
, __kmp_hier_scheds
.size
, __kmp_hier_scheds
.layers
,
737 __kmp_hier_scheds
.scheds
, __kmp_hier_scheds
.large_chunks
, lb
, ub
, st
);
740 // free all the hierarchy scheduling memory associated with the team
741 void __kmp_dispatch_free_hierarchies(kmp_team_t
*team
) {
742 int num_disp_buff
= team
->t
.t_max_nproc
> 1 ? __kmp_dispatch_num_buffers
: 2;
743 for (int i
= 0; i
< num_disp_buff
; ++i
) {
744 // type does not matter here so use kmp_int32
746 reinterpret_cast<dispatch_shared_info_template
<kmp_int32
> volatile *>(
747 &team
->t
.t_disp_buffer
[i
]);
749 sh
->hier
->deallocate();
750 __kmp_free(sh
->hier
);
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T
>
760 __kmp_dispatch_init(ident_t
*loc
, int gtid
, enum sched_type schedule
, T lb
,
761 T ub
, typename traits_t
<T
>::signed_t st
,
762 typename traits_t
<T
>::signed_t chunk
, int push_ws
) {
763 typedef typename traits_t
<T
>::unsigned_t UT
;
768 kmp_uint32 my_buffer_index
;
769 dispatch_private_info_template
<T
> *pr
;
770 dispatch_shared_info_template
<T
> volatile *sh
;
772 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template
<T
>) ==
773 sizeof(dispatch_private_info
));
774 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template
<UT
>) ==
775 sizeof(dispatch_shared_info
));
776 __kmp_assert_valid_gtid(gtid
);
778 if (!TCR_4(__kmp_init_parallel
))
779 __kmp_parallel_initialize();
781 __kmp_resume_if_soft_paused();
783 #if INCLUDE_SSC_MARKS
784 SSC_MARK_DISPATCH_INIT();
787 typedef typename traits_t
<T
>::signed_t ST
;
790 // create format specifiers before the debug output
791 buff
= __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
792 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
793 traits_t
<ST
>::spec
, traits_t
<T
>::spec
,
794 traits_t
<T
>::spec
, traits_t
<ST
>::spec
);
795 KD_TRACE(10, (buff
, gtid
, schedule
, chunk
, lb
, ub
, st
));
796 __kmp_str_free(&buff
);
800 th
= __kmp_threads
[gtid
];
801 team
= th
->th
.th_team
;
802 active
= !team
->t
.t_serialized
;
803 th
->th
.th_ident
= loc
;
805 // Any half-decent optimizer will remove this test when the blocks are empty
806 // since the macros expand to nothing
807 // when statistics are disabled.
808 if (schedule
== __kmp_static
) {
809 KMP_COUNT_BLOCK(OMP_LOOP_STATIC
);
811 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC
);
814 #if KMP_USE_HIER_SCHED
815 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
816 // Hierarchical scheduling does not work with ordered, so if ordered is
817 // detected, then revert back to threaded scheduling.
819 enum sched_type my_sched
= schedule
;
820 my_buffer_index
= th
->th
.th_dispatch
->th_disp_index
;
821 pr
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
823 ->th_disp_buffer
[my_buffer_index
% __kmp_dispatch_num_buffers
]);
824 my_sched
= SCHEDULE_WITHOUT_MODIFIERS(my_sched
);
825 if ((my_sched
>= kmp_nm_lower
) && (my_sched
< kmp_nm_upper
))
827 (enum sched_type
)(((int)my_sched
) - (kmp_nm_lower
- kmp_sch_lower
));
828 ordered
= (kmp_ord_lower
& my_sched
);
829 if (pr
->flags
.use_hier
) {
831 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
832 "Disabling hierarchical scheduling.\n",
834 pr
->flags
.use_hier
= FALSE
;
837 if (schedule
== kmp_sch_runtime
&& __kmp_hier_scheds
.size
> 0) {
838 // Don't use hierarchical for ordered parallel loops and don't
839 // use the runtime hierarchy if one was specified in the program
840 if (!ordered
&& !pr
->flags
.use_hier
)
841 __kmp_dispatch_init_hier_runtime
<T
>(loc
, lb
, ub
, st
);
843 #endif // KMP_USE_HIER_SCHED
846 kmp_uint64 cur_chunk
= chunk
;
847 int itt_need_metadata_reporting
=
848 __itt_metadata_add_ptr
&& __kmp_forkjoin_frames_mode
== 3 &&
849 KMP_MASTER_GTID(gtid
) && th
->th
.th_teams_microtask
== NULL
&&
850 team
->t
.t_active_level
== 1;
853 pr
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
854 th
->th
.th_dispatch
->th_disp_buffer
); /* top of the stack */
856 KMP_DEBUG_ASSERT(th
->th
.th_dispatch
==
857 &th
->th
.th_team
->t
.t_dispatch
[th
->th
.th_info
.ds
.ds_tid
]);
859 my_buffer_index
= th
->th
.th_dispatch
->th_disp_index
++;
861 /* What happens when number of threads changes, need to resize buffer? */
862 pr
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
864 ->th_disp_buffer
[my_buffer_index
% __kmp_dispatch_num_buffers
]);
865 sh
= reinterpret_cast<dispatch_shared_info_template
<T
> volatile *>(
866 &team
->t
.t_disp_buffer
[my_buffer_index
% __kmp_dispatch_num_buffers
]);
867 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid
,
871 __kmp_dispatch_init_algorithm(loc
, gtid
, pr
, schedule
, lb
, ub
, st
,
875 chunk
, (T
)th
->th
.th_team_nproc
,
876 (T
)th
->th
.th_info
.ds
.ds_tid
);
878 if (pr
->flags
.ordered
== 0) {
879 th
->th
.th_dispatch
->th_deo_fcn
= __kmp_dispatch_deo_error
;
880 th
->th
.th_dispatch
->th_dxo_fcn
= __kmp_dispatch_dxo_error
;
882 th
->th
.th_dispatch
->th_deo_fcn
= __kmp_dispatch_deo
<UT
>;
883 th
->th
.th_dispatch
->th_dxo_fcn
= __kmp_dispatch_dxo
<UT
>;
888 /* The name of this buffer should be my_buffer_index when it's free to use
891 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
892 "sh->buffer_index:%d\n",
893 gtid
, my_buffer_index
, sh
->buffer_index
));
894 __kmp_wait
<kmp_uint32
>(&sh
->buffer_index
, my_buffer_index
,
895 __kmp_eq
<kmp_uint32
> USE_ITT_BUILD_ARG(NULL
));
896 // Note: KMP_WAIT() cannot be used there: buffer index and
897 // my_buffer_index are *always* 32-bit integers.
898 KMP_MB(); /* is this necessary? */
899 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
900 "sh->buffer_index:%d\n",
901 gtid
, my_buffer_index
, sh
->buffer_index
));
903 th
->th
.th_dispatch
->th_dispatch_pr_current
= (dispatch_private_info_t
*)pr
;
904 th
->th
.th_dispatch
->th_dispatch_sh_current
=
905 CCAST(dispatch_shared_info_t
*, (volatile dispatch_shared_info_t
*)sh
);
907 if (pr
->flags
.ordered
) {
908 __kmp_itt_ordered_init(gtid
);
910 // Report loop metadata
911 if (itt_need_metadata_reporting
) {
912 // Only report metadata by master of active team at level 1
913 kmp_uint64 schedtype
= 0;
915 case kmp_sch_static_chunked
:
916 case kmp_sch_static_balanced
: // Chunk is calculated in the switch above
918 case kmp_sch_static_greedy
:
919 cur_chunk
= pr
->u
.p
.parm1
;
921 case kmp_sch_dynamic_chunked
:
924 case kmp_sch_guided_iterative_chunked
:
925 case kmp_sch_guided_analytical_chunked
:
926 case kmp_sch_guided_simd
:
930 // Should we put this case under "static"?
931 // case kmp_sch_static_steal:
935 __kmp_itt_metadata_loop(loc
, schedtype
, pr
->u
.p
.tc
, cur_chunk
);
937 #if KMP_USE_HIER_SCHED
938 if (pr
->flags
.use_hier
) {
940 pr
->u
.p
.ub
= pr
->u
.p
.lb
= pr
->u
.p
.st
= pr
->u
.p
.tc
= 0;
942 #endif // KMP_USER_HIER_SCHED
943 #endif /* USE_ITT_BUILD */
949 // create format specifiers before the debug output
950 buff
= __kmp_str_format(
951 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
953 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
954 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
955 traits_t
<UT
>::spec
, traits_t
<T
>::spec
, traits_t
<T
>::spec
,
956 traits_t
<ST
>::spec
, traits_t
<UT
>::spec
, traits_t
<UT
>::spec
,
957 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
, traits_t
<T
>::spec
,
958 traits_t
<T
>::spec
, traits_t
<T
>::spec
, traits_t
<T
>::spec
);
959 KD_TRACE(10, (buff
, gtid
, pr
->schedule
, pr
->flags
.ordered
, pr
->u
.p
.lb
,
960 pr
->u
.p
.ub
, pr
->u
.p
.st
, pr
->u
.p
.tc
, pr
->u
.p
.count
,
961 pr
->u
.p
.ordered_lower
, pr
->u
.p
.ordered_upper
, pr
->u
.p
.parm1
,
962 pr
->u
.p
.parm2
, pr
->u
.p
.parm3
, pr
->u
.p
.parm4
));
963 __kmp_str_free(&buff
);
966 #if (KMP_STATIC_STEAL_ENABLED)
967 // It cannot be guaranteed that after execution of a loop with some other
968 // schedule kind all the parm3 variables will contain the same value. Even if
969 // all parm3 will be the same, it still exists a bad case like using 0 and 1
970 // rather than program life-time increment. So the dedicated variable is
971 // required. The 'static_steal_counter' is used.
972 if (pr
->schedule
== kmp_sch_static_steal
) {
973 // Other threads will inspect this variable when searching for a victim.
974 // This is a flag showing that other threads may steal from this thread
976 volatile T
*p
= &pr
->u
.p
.static_steal_counter
;
979 #endif // ( KMP_STATIC_STEAL_ENABLED )
981 #if OMPT_SUPPORT && OMPT_OPTIONAL
982 if (ompt_enabled
.ompt_callback_work
) {
983 ompt_team_info_t
*team_info
= __ompt_get_teaminfo(0, NULL
);
984 ompt_task_info_t
*task_info
= __ompt_get_task_info_object(0);
985 ompt_callbacks
.ompt_callback(ompt_callback_work
)(
986 ompt_work_loop
, ompt_scope_begin
, &(team_info
->parallel_data
),
987 &(task_info
->task_data
), pr
->u
.p
.tc
, OMPT_LOAD_RETURN_ADDRESS(gtid
));
990 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic
);
993 /* For ordered loops, either __kmp_dispatch_finish() should be called after
994 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
995 * every chunk of iterations. If the ordered section(s) were not executed
996 * for this iteration (or every iteration in this chunk), we need to set the
997 * ordered iteration counters so that the next thread can proceed. */
998 template <typename UT
>
999 static void __kmp_dispatch_finish(int gtid
, ident_t
*loc
) {
1000 typedef typename traits_t
<UT
>::signed_t ST
;
1001 __kmp_assert_valid_gtid(gtid
);
1002 kmp_info_t
*th
= __kmp_threads
[gtid
];
1004 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid
));
1005 if (!th
->th
.th_team
->t
.t_serialized
) {
1007 dispatch_private_info_template
<UT
> *pr
=
1008 reinterpret_cast<dispatch_private_info_template
<UT
> *>(
1009 th
->th
.th_dispatch
->th_dispatch_pr_current
);
1010 dispatch_shared_info_template
<UT
> volatile *sh
=
1011 reinterpret_cast<dispatch_shared_info_template
<UT
> volatile *>(
1012 th
->th
.th_dispatch
->th_dispatch_sh_current
);
1013 KMP_DEBUG_ASSERT(pr
);
1014 KMP_DEBUG_ASSERT(sh
);
1015 KMP_DEBUG_ASSERT(th
->th
.th_dispatch
==
1016 &th
->th
.th_team
->t
.t_dispatch
[th
->th
.th_info
.ds
.ds_tid
]);
1018 if (pr
->ordered_bumped
) {
1021 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1023 pr
->ordered_bumped
= 0;
1025 UT lower
= pr
->u
.p
.ordered_lower
;
1030 // create format specifiers before the debug output
1031 buff
= __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1032 "ordered_iteration:%%%s lower:%%%s\n",
1033 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
);
1034 KD_TRACE(1000, (buff
, gtid
, sh
->u
.s
.ordered_iteration
, lower
));
1035 __kmp_str_free(&buff
);
1039 __kmp_wait
<UT
>(&sh
->u
.s
.ordered_iteration
, lower
,
1040 __kmp_ge
<UT
> USE_ITT_BUILD_ARG(NULL
));
1041 KMP_MB(); /* is this necessary? */
1045 // create format specifiers before the debug output
1046 buff
= __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1047 "ordered_iteration:%%%s lower:%%%s\n",
1048 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
);
1049 KD_TRACE(1000, (buff
, gtid
, sh
->u
.s
.ordered_iteration
, lower
));
1050 __kmp_str_free(&buff
);
1054 test_then_inc
<ST
>((volatile ST
*)&sh
->u
.s
.ordered_iteration
);
1057 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid
));
1060 #ifdef KMP_GOMP_COMPAT
1062 template <typename UT
>
1063 static void __kmp_dispatch_finish_chunk(int gtid
, ident_t
*loc
) {
1064 typedef typename traits_t
<UT
>::signed_t ST
;
1065 __kmp_assert_valid_gtid(gtid
);
1066 kmp_info_t
*th
= __kmp_threads
[gtid
];
1068 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid
));
1069 if (!th
->th
.th_team
->t
.t_serialized
) {
1071 dispatch_private_info_template
<UT
> *pr
=
1072 reinterpret_cast<dispatch_private_info_template
<UT
> *>(
1073 th
->th
.th_dispatch
->th_dispatch_pr_current
);
1074 dispatch_shared_info_template
<UT
> volatile *sh
=
1075 reinterpret_cast<dispatch_shared_info_template
<UT
> volatile *>(
1076 th
->th
.th_dispatch
->th_dispatch_sh_current
);
1077 KMP_DEBUG_ASSERT(pr
);
1078 KMP_DEBUG_ASSERT(sh
);
1079 KMP_DEBUG_ASSERT(th
->th
.th_dispatch
==
1080 &th
->th
.th_team
->t
.t_dispatch
[th
->th
.th_info
.ds
.ds_tid
]);
1082 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1083 UT lower
= pr
->u
.p
.ordered_lower
;
1084 UT upper
= pr
->u
.p
.ordered_upper
;
1085 UT inc
= upper
- lower
+ 1;
1087 if (pr
->ordered_bumped
== inc
) {
1090 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1092 pr
->ordered_bumped
= 0;
1094 inc
-= pr
->ordered_bumped
;
1099 // create format specifiers before the debug output
1100 buff
= __kmp_str_format(
1101 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1102 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1103 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
, traits_t
<UT
>::spec
);
1104 KD_TRACE(1000, (buff
, gtid
, sh
->u
.s
.ordered_iteration
, lower
, upper
));
1105 __kmp_str_free(&buff
);
1109 __kmp_wait
<UT
>(&sh
->u
.s
.ordered_iteration
, lower
,
1110 __kmp_ge
<UT
> USE_ITT_BUILD_ARG(NULL
));
1112 KMP_MB(); /* is this necessary? */
1113 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1114 "ordered_bumped to zero\n",
1116 pr
->ordered_bumped
= 0;
1117 //!!!!! TODO check if the inc should be unsigned, or signed???
1121 // create format specifiers before the debug output
1122 buff
= __kmp_str_format(
1123 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1124 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1125 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
, traits_t
<UT
>::spec
,
1126 traits_t
<UT
>::spec
);
1128 (buff
, gtid
, sh
->u
.s
.ordered_iteration
, inc
, lower
, upper
));
1129 __kmp_str_free(&buff
);
1133 test_then_add
<ST
>((volatile ST
*)&sh
->u
.s
.ordered_iteration
, inc
);
1137 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid
));
1140 #endif /* KMP_GOMP_COMPAT */
1142 template <typename T
>
1143 int __kmp_dispatch_next_algorithm(int gtid
,
1144 dispatch_private_info_template
<T
> *pr
,
1145 dispatch_shared_info_template
<T
> volatile *sh
,
1146 kmp_int32
*p_last
, T
*p_lb
, T
*p_ub
,
1147 typename traits_t
<T
>::signed_t
*p_st
, T nproc
,
1149 typedef typename traits_t
<T
>::unsigned_t UT
;
1150 typedef typename traits_t
<T
>::signed_t ST
;
1151 typedef typename traits_t
<T
>::floating_t DBL
;
1156 UT limit
, trip
, init
;
1157 kmp_info_t
*th
= __kmp_threads
[gtid
];
1158 kmp_team_t
*team
= th
->th
.th_team
;
1160 KMP_DEBUG_ASSERT(th
->th
.th_dispatch
==
1161 &th
->th
.th_team
->t
.t_dispatch
[th
->th
.th_info
.ds
.ds_tid
]);
1162 KMP_DEBUG_ASSERT(pr
);
1163 KMP_DEBUG_ASSERT(sh
);
1164 KMP_DEBUG_ASSERT(tid
>= 0 && tid
< nproc
);
1168 // create format specifiers before the debug output
1170 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1171 "sh:%%p nproc:%%%s tid:%%%s\n",
1172 traits_t
<T
>::spec
, traits_t
<T
>::spec
);
1173 KD_TRACE(10, (buff
, gtid
, pr
, sh
, nproc
, tid
));
1174 __kmp_str_free(&buff
);
1179 if (pr
->u
.p
.tc
== 0) {
1181 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1187 switch (pr
->schedule
) {
1188 #if (KMP_STATIC_STEAL_ENABLED)
1189 case kmp_sch_static_steal
: {
1190 T chunk
= pr
->u
.p
.parm1
;
1193 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1196 trip
= pr
->u
.p
.tc
- 1;
1198 if (traits_t
<T
>::type_size
> 4) {
1199 // use lock for 8-byte and CAS for 4-byte induction
1200 // variable. TODO (optional): check and use 16-byte CAS
1201 kmp_lock_t
*lck
= pr
->u
.p
.th_steal_lock
;
1202 KMP_DEBUG_ASSERT(lck
!= NULL
);
1203 if (pr
->u
.p
.count
< (UT
)pr
->u
.p
.ub
) {
1204 __kmp_acquire_lock(lck
, gtid
);
1205 // try to get own chunk of iterations
1206 init
= (pr
->u
.p
.count
)++;
1207 status
= (init
< (UT
)pr
->u
.p
.ub
);
1208 __kmp_release_lock(lck
, gtid
);
1210 status
= 0; // no own chunks
1212 if (!status
) { // try to steal
1213 kmp_info_t
**other_threads
= team
->t
.t_threads
;
1214 int while_limit
= pr
->u
.p
.parm3
;
1215 int while_index
= 0;
1216 T id
= pr
->u
.p
.static_steal_counter
; // loop id
1217 int idx
= (th
->th
.th_dispatch
->th_disp_index
- 1) %
1218 __kmp_dispatch_num_buffers
; // current loop index
1219 // note: victim thread can potentially execute another loop
1220 // TODO: algorithm of searching for a victim
1221 // should be cleaned up and measured
1222 while ((!status
) && (while_limit
!= ++while_index
)) {
1223 dispatch_private_info_template
<T
> *victim
;
1225 T victimIdx
= pr
->u
.p
.parm4
;
1226 T oldVictimIdx
= victimIdx
? victimIdx
- 1 : nproc
- 1;
1227 victim
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
1228 &other_threads
[victimIdx
]->th
.th_dispatch
->th_disp_buffer
[idx
]);
1229 KMP_DEBUG_ASSERT(victim
);
1230 while ((victim
== pr
|| id
!= victim
->u
.p
.static_steal_counter
) &&
1231 oldVictimIdx
!= victimIdx
) {
1232 victimIdx
= (victimIdx
+ 1) % nproc
;
1233 victim
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
1234 &other_threads
[victimIdx
]->th
.th_dispatch
->th_disp_buffer
[idx
]);
1235 KMP_DEBUG_ASSERT(victim
);
1237 if (victim
== pr
|| id
!= victim
->u
.p
.static_steal_counter
) {
1238 continue; // try once more (nproc attempts in total)
1239 // no victim is ready yet to participate in stealing
1240 // because no victim passed kmp_init_dispatch yet
1242 if (victim
->u
.p
.count
+ 2 > (UT
)victim
->u
.p
.ub
) {
1243 pr
->u
.p
.parm4
= (victimIdx
+ 1) % nproc
; // shift start tid
1244 continue; // not enough chunks to steal, goto next victim
1247 lck
= victim
->u
.p
.th_steal_lock
;
1248 KMP_ASSERT(lck
!= NULL
);
1249 __kmp_acquire_lock(lck
, gtid
);
1250 limit
= victim
->u
.p
.ub
; // keep initial ub
1251 if (victim
->u
.p
.count
>= limit
||
1252 (remaining
= limit
- victim
->u
.p
.count
) < 2) {
1253 __kmp_release_lock(lck
, gtid
);
1254 pr
->u
.p
.parm4
= (victimIdx
+ 1) % nproc
; // next victim
1255 continue; // not enough chunks to steal
1257 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1259 if (remaining
> 3) {
1260 // steal 1/4 of remaining
1261 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen
, remaining
>> 2);
1262 init
= (victim
->u
.p
.ub
-= (remaining
>> 2));
1264 // steal 1 chunk of 2 or 3 remaining
1265 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen
, 1);
1266 init
= (victim
->u
.p
.ub
-= 1);
1268 __kmp_release_lock(lck
, gtid
);
1270 KMP_DEBUG_ASSERT(init
+ 1 <= limit
);
1271 pr
->u
.p
.parm4
= victimIdx
; // remember victim to steal from
1274 // now update own count and ub with stolen range but init chunk
1275 __kmp_acquire_lock(pr
->u
.p
.th_steal_lock
, gtid
);
1276 pr
->u
.p
.count
= init
+ 1;
1278 __kmp_release_lock(pr
->u
.p
.th_steal_lock
, gtid
);
1279 } // while (search for victim)
1280 } // if (try to find victim and steal)
1282 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1290 // All operations on 'count' or 'ub' must be combined atomically
1293 union_i4 vold
, vnew
;
1294 vold
.b
= *(volatile kmp_int64
*)(&pr
->u
.p
.count
);
1297 while (!KMP_COMPARE_AND_STORE_ACQ64(
1298 (volatile kmp_int64
*)&pr
->u
.p
.count
,
1299 *VOLATILE_CAST(kmp_int64
*) & vold
.b
,
1300 *VOLATILE_CAST(kmp_int64
*) & vnew
.b
)) {
1302 vold
.b
= *(volatile kmp_int64
*)(&pr
->u
.p
.count
);
1307 init
= vnew
.p
.count
;
1308 status
= (init
< (UT
)vnew
.p
.ub
);
1312 kmp_info_t
**other_threads
= team
->t
.t_threads
;
1313 int while_limit
= pr
->u
.p
.parm3
;
1314 int while_index
= 0;
1315 T id
= pr
->u
.p
.static_steal_counter
; // loop id
1316 int idx
= (th
->th
.th_dispatch
->th_disp_index
- 1) %
1317 __kmp_dispatch_num_buffers
; // current loop index
1318 // note: victim thread can potentially execute another loop
1319 // TODO: algorithm of searching for a victim
1320 // should be cleaned up and measured
1321 while ((!status
) && (while_limit
!= ++while_index
)) {
1322 dispatch_private_info_template
<T
> *victim
;
1323 union_i4 vold
, vnew
;
1324 kmp_int32 remaining
;
1325 T victimIdx
= pr
->u
.p
.parm4
;
1326 T oldVictimIdx
= victimIdx
? victimIdx
- 1 : nproc
- 1;
1327 victim
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
1328 &other_threads
[victimIdx
]->th
.th_dispatch
->th_disp_buffer
[idx
]);
1329 KMP_DEBUG_ASSERT(victim
);
1330 while ((victim
== pr
|| id
!= victim
->u
.p
.static_steal_counter
) &&
1331 oldVictimIdx
!= victimIdx
) {
1332 victimIdx
= (victimIdx
+ 1) % nproc
;
1333 victim
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
1334 &other_threads
[victimIdx
]->th
.th_dispatch
->th_disp_buffer
[idx
]);
1335 KMP_DEBUG_ASSERT(victim
);
1337 if (victim
== pr
|| id
!= victim
->u
.p
.static_steal_counter
) {
1338 continue; // try once more (nproc attempts in total)
1339 // no victim is ready yet to participate in stealing
1340 // because no victim passed kmp_init_dispatch yet
1342 pr
->u
.p
.parm4
= victimIdx
; // new victim found
1343 while (1) { // CAS loop if victim has enough chunks to steal
1344 vold
.b
= *(volatile kmp_int64
*)(&victim
->u
.p
.count
);
1347 KMP_DEBUG_ASSERT((vnew
.p
.ub
- 1) * (UT
)chunk
<= trip
);
1348 if (vnew
.p
.count
>= (UT
)vnew
.p
.ub
||
1349 (remaining
= vnew
.p
.ub
- vnew
.p
.count
) < 2) {
1350 pr
->u
.p
.parm4
= (victimIdx
+ 1) % nproc
; // shift start victim id
1351 break; // not enough chunks to steal, goto next victim
1353 if (remaining
> 3) {
1354 vnew
.p
.ub
-= (remaining
>> 2); // try to steal 1/4 of remaining
1356 vnew
.p
.ub
-= 1; // steal 1 chunk of 2 or 3 remaining
1358 KMP_DEBUG_ASSERT((vnew
.p
.ub
- 1) * (UT
)chunk
<= trip
);
1359 // TODO: Should this be acquire or release?
1360 if (KMP_COMPARE_AND_STORE_ACQ64(
1361 (volatile kmp_int64
*)&victim
->u
.p
.count
,
1362 *VOLATILE_CAST(kmp_int64
*) & vold
.b
,
1363 *VOLATILE_CAST(kmp_int64
*) & vnew
.b
)) {
1364 // stealing succeeded
1365 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen
,
1366 vold
.p
.ub
- vnew
.p
.ub
);
1369 // now update own count and ub
1371 vold
.p
.count
= init
+ 1;
1373 KMP_XCHG_FIXED64((volatile kmp_int64
*)(&pr
->u
.p
.count
), vold
.b
);
1375 *(volatile kmp_int64
*)(&pr
->u
.p
.count
) = vold
.b
;
1378 } // if (check CAS result)
1379 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1380 } // while (try to steal from particular victim)
1381 } // while (search for victim)
1382 } // if (try to find victim and steal)
1383 } // if (4-byte induction variable)
1390 start
= pr
->u
.p
.parm2
;
1392 limit
= chunk
+ init
- 1;
1394 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks
, 1);
1396 KMP_DEBUG_ASSERT(init
<= trip
);
1397 if ((last
= (limit
>= trip
)) != 0)
1403 *p_lb
= start
+ init
;
1404 *p_ub
= start
+ limit
;
1406 *p_lb
= start
+ init
* incr
;
1407 *p_ub
= start
+ limit
* incr
;
1410 if (pr
->flags
.ordered
) {
1411 pr
->u
.p
.ordered_lower
= init
;
1412 pr
->u
.p
.ordered_upper
= limit
;
1417 #endif // ( KMP_STATIC_STEAL_ENABLED )
1418 case kmp_sch_static_balanced
: {
1421 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1423 /* check if thread has any iteration to do */
1424 if ((status
= !pr
->u
.p
.count
) != 0) {
1428 last
= pr
->u
.p
.parm1
;
1431 } else { /* no iterations to do */
1432 pr
->u
.p
.lb
= pr
->u
.p
.ub
+ pr
->u
.p
.st
;
1436 case kmp_sch_static_greedy
: /* original code for kmp_sch_static_greedy was
1438 case kmp_sch_static_chunked
: {
1441 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1442 "kmp_sch_static_[affinity|chunked] case\n",
1444 parm1
= pr
->u
.p
.parm1
;
1446 trip
= pr
->u
.p
.tc
- 1;
1447 init
= parm1
* (pr
->u
.p
.count
+ tid
);
1449 if ((status
= (init
<= trip
)) != 0) {
1452 limit
= parm1
+ init
- 1;
1454 if ((last
= (limit
>= trip
)) != 0)
1460 pr
->u
.p
.count
+= nproc
;
1463 *p_lb
= start
+ init
;
1464 *p_ub
= start
+ limit
;
1466 *p_lb
= start
+ init
* incr
;
1467 *p_ub
= start
+ limit
* incr
;
1470 if (pr
->flags
.ordered
) {
1471 pr
->u
.p
.ordered_lower
= init
;
1472 pr
->u
.p
.ordered_upper
= limit
;
1478 case kmp_sch_dynamic_chunked
: {
1479 T chunk
= pr
->u
.p
.parm1
;
1483 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1486 init
= chunk
* test_then_inc_acq
<ST
>((volatile ST
*)&sh
->u
.s
.iteration
);
1487 trip
= pr
->u
.p
.tc
- 1;
1489 if ((status
= (init
<= trip
)) == 0) {
1496 limit
= chunk
+ init
- 1;
1499 if ((last
= (limit
>= trip
)) != 0)
1506 *p_lb
= start
+ init
;
1507 *p_ub
= start
+ limit
;
1509 *p_lb
= start
+ init
* incr
;
1510 *p_ub
= start
+ limit
* incr
;
1513 if (pr
->flags
.ordered
) {
1514 pr
->u
.p
.ordered_lower
= init
;
1515 pr
->u
.p
.ordered_upper
= limit
;
1521 case kmp_sch_guided_iterative_chunked
: {
1522 T chunkspec
= pr
->u
.p
.parm1
;
1523 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1527 // Start atomic part of calculations
1529 ST remaining
; // signed, because can be < 0
1530 init
= sh
->u
.s
.iteration
; // shared value
1531 remaining
= trip
- init
;
1532 if (remaining
<= 0) { // AC: need to compare with 0 first
1533 // nothing to do, don't try atomic op
1538 pr
->u
.p
.parm2
) { // compare with K*nproc*(chunk+1), K=2 by default
1539 // use dynamic-style schedule
1540 // atomically increment iterations, get old value
1541 init
= test_then_add
<ST
>(RCAST(volatile ST
*, &sh
->u
.s
.iteration
),
1543 remaining
= trip
- init
;
1544 if (remaining
<= 0) {
1545 status
= 0; // all iterations got by other threads
1547 // got some iterations to work on
1549 if ((T
)remaining
> chunkspec
) {
1550 limit
= init
+ chunkspec
- 1;
1552 last
= 1; // the last chunk
1553 limit
= init
+ remaining
- 1;
1559 (UT
)(remaining
* *(double *)&pr
->u
.p
.parm3
); // divide by K*nproc
1560 if (compare_and_swap
<ST
>(RCAST(volatile ST
*, &sh
->u
.s
.iteration
),
1561 (ST
)init
, (ST
)limit
)) {
1562 // CAS was successful, chunk obtained
1573 *p_lb
= start
+ init
* incr
;
1574 *p_ub
= start
+ limit
* incr
;
1575 if (pr
->flags
.ordered
) {
1576 pr
->u
.p
.ordered_lower
= init
;
1577 pr
->u
.p
.ordered_upper
= limit
;
1588 case kmp_sch_guided_simd
: {
1589 // same as iterative but curr-chunk adjusted to be multiple of given
1591 T chunk
= pr
->u
.p
.parm1
;
1593 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1596 // Start atomic part of calculations
1598 ST remaining
; // signed, because can be < 0
1599 init
= sh
->u
.s
.iteration
; // shared value
1600 remaining
= trip
- init
;
1601 if (remaining
<= 0) { // AC: need to compare with 0 first
1602 status
= 0; // nothing to do, don't try atomic op
1605 KMP_DEBUG_ASSERT(init
% chunk
== 0);
1606 // compare with K*nproc*(chunk+1), K=2 by default
1607 if ((T
)remaining
< pr
->u
.p
.parm2
) {
1608 // use dynamic-style schedule
1609 // atomically increment iterations, get old value
1610 init
= test_then_add
<ST
>(RCAST(volatile ST
*, &sh
->u
.s
.iteration
),
1612 remaining
= trip
- init
;
1613 if (remaining
<= 0) {
1614 status
= 0; // all iterations got by other threads
1616 // got some iterations to work on
1618 if ((T
)remaining
> chunk
) {
1619 limit
= init
+ chunk
- 1;
1621 last
= 1; // the last chunk
1622 limit
= init
+ remaining
- 1;
1627 // divide by K*nproc
1628 UT span
= remaining
* (*(double *)&pr
->u
.p
.parm3
);
1629 UT rem
= span
% chunk
;
1630 if (rem
) // adjust so that span%chunk == 0
1631 span
+= chunk
- rem
;
1632 limit
= init
+ span
;
1633 if (compare_and_swap
<ST
>(RCAST(volatile ST
*, &sh
->u
.s
.iteration
),
1634 (ST
)init
, (ST
)limit
)) {
1635 // CAS was successful, chunk obtained
1646 *p_lb
= start
+ init
* incr
;
1647 *p_ub
= start
+ limit
* incr
;
1648 if (pr
->flags
.ordered
) {
1649 pr
->u
.p
.ordered_lower
= init
;
1650 pr
->u
.p
.ordered_upper
= limit
;
1661 case kmp_sch_guided_analytical_chunked
: {
1662 T chunkspec
= pr
->u
.p
.parm1
;
1664 #if KMP_USE_X87CONTROL
1665 /* for storing original FPCW value for Windows* OS on
1666 IA-32 architecture 8-byte version */
1667 unsigned int oldFpcw
;
1668 unsigned int fpcwSet
= 0;
1670 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1671 "kmp_sch_guided_analytical_chunked case\n",
1676 KMP_DEBUG_ASSERT(nproc
> 1);
1677 KMP_DEBUG_ASSERT((2UL * chunkspec
+ 1) * (UT
)nproc
< trip
);
1679 while (1) { /* this while loop is a safeguard against unexpected zero
1681 chunkIdx
= test_then_inc_acq
<ST
>((volatile ST
*)&sh
->u
.s
.iteration
);
1682 if (chunkIdx
>= (UT
)pr
->u
.p
.parm2
) {
1684 /* use dynamic-style scheduling */
1685 init
= chunkIdx
* chunkspec
+ pr
->u
.p
.count
;
1686 /* need to verify init > 0 in case of overflow in the above
1688 if ((status
= (init
> 0 && init
<= trip
)) != 0) {
1689 limit
= init
+ chunkspec
- 1;
1691 if ((last
= (limit
>= trip
)) != 0)
1696 /* use exponential-style scheduling */
1697 /* The following check is to workaround the lack of long double precision on
1699 This check works around the possible effect that init != 0 for chunkIdx == 0.
1701 #if KMP_USE_X87CONTROL
1702 /* If we haven't already done so, save original
1703 FPCW and set precision to 64-bit, as Windows* OS
1704 on IA-32 architecture defaults to 53-bit */
1706 oldFpcw
= _control87(0, 0);
1707 _control87(_PC_64
, _MCW_PC
);
1712 init
= __kmp_dispatch_guided_remaining
<T
>(
1713 trip
, *(DBL
*)&pr
->u
.p
.parm3
, chunkIdx
);
1714 KMP_DEBUG_ASSERT(init
);
1718 limit
= trip
- __kmp_dispatch_guided_remaining
<T
>(
1719 trip
, *(DBL
*)&pr
->u
.p
.parm3
, chunkIdx
+ 1);
1720 KMP_ASSERT(init
<= limit
);
1722 KMP_DEBUG_ASSERT(limit
<= trip
);
1729 #if KMP_USE_X87CONTROL
1730 /* restore FPCW if necessary
1731 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1733 if (fpcwSet
&& (oldFpcw
& fpcwSet
))
1734 _control87(oldFpcw
, _MCW_PC
);
1741 *p_lb
= start
+ init
* incr
;
1742 *p_ub
= start
+ limit
* incr
;
1743 if (pr
->flags
.ordered
) {
1744 pr
->u
.p
.ordered_lower
= init
;
1745 pr
->u
.p
.ordered_upper
= limit
;
1756 case kmp_sch_trapezoidal
: {
1758 T parm2
= pr
->u
.p
.parm2
;
1759 T parm3
= pr
->u
.p
.parm3
;
1760 T parm4
= pr
->u
.p
.parm4
;
1762 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1765 index
= test_then_inc
<ST
>((volatile ST
*)&sh
->u
.s
.iteration
);
1767 init
= (index
* ((2 * parm2
) - (index
- 1) * parm4
)) / 2;
1768 trip
= pr
->u
.p
.tc
- 1;
1770 if ((status
= ((T
)index
< parm3
&& init
<= trip
)) == 0) {
1777 limit
= ((index
+ 1) * (2 * parm2
- index
* parm4
)) / 2 - 1;
1780 if ((last
= (limit
>= trip
)) != 0)
1787 *p_lb
= start
+ init
;
1788 *p_ub
= start
+ limit
;
1790 *p_lb
= start
+ init
* incr
;
1791 *p_ub
= start
+ limit
* incr
;
1794 if (pr
->flags
.ordered
) {
1795 pr
->u
.p
.ordered_lower
= init
;
1796 pr
->u
.p
.ordered_upper
= limit
;
1802 status
= 0; // to avoid complaints on uninitialized variable use
1803 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected
), // Primary message
1804 KMP_HNT(GetNewerLibrary
), // Hint
1805 __kmp_msg_null
// Variadic argument list terminator
1812 if (pr
->flags
.ordered
) {
1814 // create format specifiers before the debug output
1815 buff
= __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1816 "ordered_lower:%%%s ordered_upper:%%%s\n",
1817 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
);
1818 KD_TRACE(1000, (buff
, gtid
, pr
->u
.p
.ordered_lower
, pr
->u
.p
.ordered_upper
));
1819 __kmp_str_free(&buff
);
1823 // create format specifiers before the debug output
1824 buff
= __kmp_str_format(
1825 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1826 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1827 traits_t
<T
>::spec
, traits_t
<T
>::spec
, traits_t
<ST
>::spec
);
1828 KD_TRACE(10, (buff
, gtid
, status
, *p_last
, *p_lb
, *p_ub
, *p_st
));
1829 __kmp_str_free(&buff
);
1835 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1836 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1838 #if OMPT_SUPPORT && OMPT_OPTIONAL
1839 #define OMPT_LOOP_END \
1840 if (status == 0) { \
1841 if (ompt_enabled.ompt_callback_work) { \
1842 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1843 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1844 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1845 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1846 &(task_info->task_data), 0, codeptr); \
1849 // TODO: implement count
1851 #define OMPT_LOOP_END // no-op
1854 #if KMP_STATS_ENABLED
1855 #define KMP_STATS_LOOP_END \
1857 kmp_int64 u, l, t, i; \
1858 l = (kmp_int64)(*p_lb); \
1859 u = (kmp_int64)(*p_ub); \
1860 i = (kmp_int64)(pr->u.p.st); \
1861 if (status == 0) { \
1863 KMP_POP_PARTITIONED_TIMER(); \
1864 } else if (i == 1) { \
1869 } else if (i < 0) { \
1871 t = (l - u) / (-i) + 1; \
1876 t = (u - l) / i + 1; \
1880 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1883 #define KMP_STATS_LOOP_END /* Nothing */
1886 template <typename T
>
1887 static int __kmp_dispatch_next(ident_t
*loc
, int gtid
, kmp_int32
*p_last
,
1889 typename traits_t
<T
>::signed_t
*p_st
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1896 typedef typename traits_t
<T
>::unsigned_t UT
;
1897 typedef typename traits_t
<T
>::signed_t ST
;
1898 // This is potentially slightly misleading, schedule(runtime) will appear here
1899 // even if the actual runtime schedule is static. (Which points out a
1900 // disadvantage of schedule(runtime): even when static scheduling is used it
1901 // costs more than a compile time choice to use static scheduling would.)
1902 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling
);
1905 dispatch_private_info_template
<T
> *pr
;
1906 __kmp_assert_valid_gtid(gtid
);
1907 kmp_info_t
*th
= __kmp_threads
[gtid
];
1908 kmp_team_t
*team
= th
->th
.th_team
;
1910 KMP_DEBUG_ASSERT(p_lb
&& p_ub
&& p_st
); // AC: these cannot be NULL
1913 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1914 gtid
, p_lb
, p_ub
, p_st
, p_last
));
1916 if (team
->t
.t_serialized
) {
1917 /* NOTE: serialize this dispatch because we are not at the active level */
1918 pr
= reinterpret_cast<dispatch_private_info_template
<T
> *>(
1919 th
->th
.th_dispatch
->th_disp_buffer
); /* top of the stack */
1920 KMP_DEBUG_ASSERT(pr
);
1922 if ((status
= (pr
->u
.p
.tc
!= 0)) == 0) {
1925 // if ( p_last != NULL )
1929 if (__kmp_env_consistency_check
) {
1930 if (pr
->pushed_ws
!= ct_none
) {
1931 pr
->pushed_ws
= __kmp_pop_workshare(gtid
, pr
->pushed_ws
, loc
);
1934 } else if (pr
->flags
.nomerge
) {
1937 UT limit
, trip
, init
;
1939 T chunk
= pr
->u
.p
.parm1
;
1941 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1944 init
= chunk
* pr
->u
.p
.count
++;
1945 trip
= pr
->u
.p
.tc
- 1;
1947 if ((status
= (init
<= trip
)) == 0) {
1950 // if ( p_last != NULL )
1954 if (__kmp_env_consistency_check
) {
1955 if (pr
->pushed_ws
!= ct_none
) {
1956 pr
->pushed_ws
= __kmp_pop_workshare(gtid
, pr
->pushed_ws
, loc
);
1961 limit
= chunk
+ init
- 1;
1964 if ((last
= (limit
>= trip
)) != 0) {
1967 pr
->u
.p
.last_upper
= pr
->u
.p
.ub
;
1968 #endif /* KMP_OS_WINDOWS */
1975 *p_lb
= start
+ init
;
1976 *p_ub
= start
+ limit
;
1978 *p_lb
= start
+ init
* incr
;
1979 *p_ub
= start
+ limit
* incr
;
1982 if (pr
->flags
.ordered
) {
1983 pr
->u
.p
.ordered_lower
= init
;
1984 pr
->u
.p
.ordered_upper
= limit
;
1988 // create format specifiers before the debug output
1989 buff
= __kmp_str_format("__kmp_dispatch_next: T#%%d "
1990 "ordered_lower:%%%s ordered_upper:%%%s\n",
1991 traits_t
<UT
>::spec
, traits_t
<UT
>::spec
);
1992 KD_TRACE(1000, (buff
, gtid
, pr
->u
.p
.ordered_lower
,
1993 pr
->u
.p
.ordered_upper
));
1994 __kmp_str_free(&buff
);
2004 pr
->u
.p
.last_upper
= *p_ub
;
2005 #endif /* KMP_OS_WINDOWS */
2014 // create format specifiers before the debug output
2015 buff
= __kmp_str_format(
2016 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2017 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2018 traits_t
<T
>::spec
, traits_t
<T
>::spec
, traits_t
<ST
>::spec
);
2019 KD_TRACE(10, (buff
, gtid
, *p_lb
, *p_ub
, *p_st
, p_last
,
2020 (p_last
? *p_last
: 0), status
));
2021 __kmp_str_free(&buff
);
2024 #if INCLUDE_SSC_MARKS
2025 SSC_MARK_DISPATCH_NEXT();
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
            kmp_lock_t *lck = buf->u.p.th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            buf->u.p.th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  return status;
}

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0     - compile-time check
      //   for(i=10;i<0;--i) // incr < 0     - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
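
  // Worked example (illustrative, hypothetical values): for *plower=0,
  // *pupper=9, incr=1 the chain above gives trip_count=10. With nteams=4 and
  // the balanced split below, chunk=10/4=2 and extras=10%4=2, so teams 0..3
  // get 3, 3, 2 and 2 iterations and team 3 sets *plastiter. With the greedy
  // split, chunk_inc_count=(10/4+1)*1=3 and the last team's upper bound is
  // clipped back to the original *pupper.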

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )

/*!
@ingroup WORK_SHARING

@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
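
/* Illustrative sketch (not part of the runtime): roughly the call a compiler
   might emit for "#pragma omp for schedule(dynamic, 4)" over iterations
   0..999; loc (an ident_t *) and gtid are assumed to be available in the
   outlined function, and the literal bounds are hypothetical.

     kmp_int32 lb = 0, ub = 999, st = 1;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     // ...the thread then pulls chunks via __kmpc_dispatch_next_4(), below. */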

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space must be computed before regular iteration dispatching begins.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
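
/* Illustrative sketch (not part of the runtime): for a composite
   "distribute parallel for" the compiler is expected to use the _dist_
   variant, so __kmp_dist_get_bounds() first narrows [lb, ub] to this team's
   slice of the iteration space, and only then the ordinary dynamic dispatch
   is initialized on that sub-range. The values below are hypothetical.

     kmp_int32 last = 0, lb = 0, ub = 9999, st = 1;
     __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
                                 lb, ub, st, 1); */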

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
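
/* Illustrative sketch (not part of the runtime): the chunk-retrieval loop
   typically emitted around __kmpc_dispatch_next_4(); the local variable names
   are hypothetical.

     kmp_int32 lb, ub, st, last;
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // ...loop body for iteration i...
       }
     }
     // A zero return means no more chunks; lb, ub and st are then left as-is. */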

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}
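
/* Illustrative note (a sketch, not the definitive lowering): for an ordered
   dynamic loop the finish call is paired with each retrieved chunk, e.g.

     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       // ...iterate lb..ub, honoring the ordered region...
       __kmpc_dispatch_fini_4(loc, gtid);
     }

   As the definitions here show, the signed and unsigned 4-byte (and 8-byte)
   variants funnel into the same unsigned instantiation of
   __kmp_dispatch_finish. */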

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
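
/* Illustrative sketch (not part of the runtime): spinning until a flag reaches
   a given value, using one of the predicates defined above. The flag variable
   here is hypothetical; real callers pass runtime-internal counters.

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);
     // __kmp_wait_4 returns the last value it observed in the flag. */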

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */