5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
27 #if KMP_MIC && USE_NGO_STORES
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
40 void __kmp_print_structure(void); // Forward declaration
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47 void distributedBarrier::computeVarsForN(size_t n
) {
50 int socket_level
= __kmp_topology
->get_level(KMP_HW_SOCKET
);
51 int core_level
= __kmp_topology
->get_level(KMP_HW_CORE
);
52 int ncores_per_socket
=
53 __kmp_topology
->calculate_ratio(core_level
, socket_level
);
54 nsockets
= __kmp_topology
->get_count(socket_level
);
58 if (ncores_per_socket
<= 0)
59 ncores_per_socket
= 1;
61 threads_per_go
= ncores_per_socket
>> 1;
62 if (!fix_threads_per_go
) {
64 if (threads_per_go
> 4) {
65 if (KMP_OPTIMIZE_FOR_REDUCTIONS
) {
66 threads_per_go
= threads_per_go
>> 1;
68 if (threads_per_go
> 4 && nsockets
== 1)
69 threads_per_go
= threads_per_go
>> 1;
72 if (threads_per_go
== 0)
74 fix_threads_per_go
= true;
75 num_gos
= n
/ threads_per_go
;
76 if (n
% threads_per_go
)
78 if (nsockets
== 1 || num_gos
== 1)
81 num_groups
= num_gos
/ nsockets
;
82 if (num_gos
% nsockets
)
87 gos_per_group
= num_gos
/ num_groups
;
88 if (num_gos
% num_groups
)
90 threads_per_group
= threads_per_go
* gos_per_group
;
92 num_gos
= n
/ threads_per_go
;
93 if (n
% threads_per_go
)
98 num_groups
= num_gos
/ 2;
102 gos_per_group
= num_gos
/ num_groups
;
103 if (num_gos
% num_groups
)
105 threads_per_group
= threads_per_go
* gos_per_group
;
109 void distributedBarrier::computeGo(size_t n
) {
111 for (num_gos
= 1;; num_gos
++)
112 if (IDEAL_CONTENTION
* num_gos
>= n
)
114 threads_per_go
= n
/ num_gos
;
117 while (num_gos
> MAX_GOS
) {
119 num_gos
= n
/ threads_per_go
;
120 if (n
% threads_per_go
)
126 // This function is to resize the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr
) {
129 KMP_DEBUG_ASSERT(nthr
> max_threads
);
131 // expand to requested size * 2
132 max_threads
= nthr
* 2;
134 // allocate arrays to new max threads
135 for (int i
= 0; i
< MAX_ITERS
; ++i
) {
137 flags
[i
] = (flags_s
*)KMP_INTERNAL_REALLOC(flags
[i
],
138 max_threads
* sizeof(flags_s
));
140 flags
[i
] = (flags_s
*)KMP_INTERNAL_MALLOC(max_threads
* sizeof(flags_s
));
144 go
= (go_s
*)KMP_INTERNAL_REALLOC(go
, max_threads
* sizeof(go_s
));
146 go
= (go_s
*)KMP_INTERNAL_MALLOC(max_threads
* sizeof(go_s
));
149 iter
= (iter_s
*)KMP_INTERNAL_REALLOC(iter
, max_threads
* sizeof(iter_s
));
151 iter
= (iter_s
*)KMP_INTERNAL_MALLOC(max_threads
* sizeof(iter_s
));
155 (sleep_s
*)KMP_INTERNAL_REALLOC(sleep
, max_threads
* sizeof(sleep_s
));
157 sleep
= (sleep_s
*)KMP_INTERNAL_MALLOC(max_threads
* sizeof(sleep_s
));
160 // This function is to set all the go flags that threads might be waiting
161 // on, and when blocktime is not infinite, it should be followed by a wake-up
162 // call to each thread
163 kmp_uint64
distributedBarrier::go_release() {
164 kmp_uint64 next_go
= iter
[0].iter
+ distributedBarrier::MAX_ITERS
;
165 for (size_t j
= 0; j
< num_gos
; j
++) {
166 go
[j
].go
.store(next_go
);
171 void distributedBarrier::go_reset() {
172 for (size_t j
= 0; j
< max_threads
; ++j
) {
173 for (size_t i
= 0; i
< distributedBarrier::MAX_ITERS
; ++i
) {
174 flags
[i
][j
].stillNeed
= 1;
181 // This function inits/re-inits the distributed barrier for a particular number
182 // of threads. If a resize of arrays is needed, it calls the resize function.
183 void distributedBarrier::init(size_t nthr
) {
184 size_t old_max
= max_threads
;
185 if (nthr
> max_threads
) { // need more space in arrays
189 for (size_t i
= 0; i
< max_threads
; i
++) {
190 for (size_t j
= 0; j
< distributedBarrier::MAX_ITERS
; j
++) {
191 flags
[j
][i
].stillNeed
= 1;
196 sleep
[i
].sleep
= false;
199 // Recalculate num_gos, etc. based on new nthr
200 computeVarsForN(nthr
);
204 if (team_icvs
== NULL
)
205 team_icvs
= __kmp_allocate(sizeof(kmp_internal_control_t
));
208 // This function is used only when KMP_BLOCKTIME is not infinite.
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt
, kmp_team_t
*team
,
211 size_t start
, size_t stop
, size_t inc
,
213 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
);
214 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
217 kmp_info_t
**other_threads
= team
->t
.t_threads
;
218 for (size_t thr
= start
; thr
< stop
; thr
+= inc
) {
219 KMP_DEBUG_ASSERT(other_threads
[thr
]);
220 int gtid
= other_threads
[thr
]->th
.th_info
.ds
.ds_gtid
;
221 // Wake up worker regardless of if it appears to be sleeping or not
222 __kmp_atomic_resume_64(gtid
, (kmp_atomic_flag_64
<> *)NULL
);
226 static void __kmp_dist_barrier_gather(
227 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
228 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
229 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather
);
231 distributedBarrier
*b
;
232 kmp_info_t
**other_threads
;
233 kmp_uint64 my_current_iter
, my_next_iter
;
237 team
= this_thr
->th
.th_team
;
238 nproc
= this_thr
->th
.th_team_nproc
;
239 other_threads
= team
->t
.t_threads
;
241 my_current_iter
= b
->iter
[tid
].iter
;
242 my_next_iter
= (my_current_iter
+ 1) % distributedBarrier::MAX_ITERS
;
243 group_leader
= ((tid
% b
->threads_per_group
) == 0);
246 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247 gtid
, team
->t
.t_id
, tid
, bt
));
249 #if USE_ITT_BUILD && USE_ITT_NOTIFY
250 // Barrier imbalance - save arrive time to the thread
251 if (__kmp_forkjoin_frames_mode
== 3 || __kmp_forkjoin_frames_mode
== 2) {
252 this_thr
->th
.th_bar_arrive_time
= this_thr
->th
.th_bar_min_time
=
253 __itt_get_timestamp();
258 // Start from the thread after the group leader
259 size_t group_start
= tid
+ 1;
260 size_t group_end
= tid
+ b
->threads_per_group
;
261 size_t threads_pending
= 0;
263 if (group_end
> nproc
)
265 do { // wait for threads in my group
267 // Check all the flags every time to avoid branch misspredict
268 for (size_t thr
= group_start
; thr
< group_end
; thr
++) {
269 // Each thread uses a different cache line
270 threads_pending
+= b
->flags
[my_current_iter
][thr
].stillNeed
;
272 // Execute tasks here
273 if (__kmp_tasking_mode
!= tskm_immediate_exec
) {
274 kmp_task_team_t
*task_team
= this_thr
->th
.th_task_team
;
275 if (task_team
!= NULL
) {
276 if (TCR_SYNC_4(task_team
->tt
.tt_active
)) {
277 if (KMP_TASKING_ENABLED(task_team
)) {
278 int tasks_completed
= FALSE
;
279 __kmp_atomic_execute_tasks_64(
280 this_thr
, gtid
, (kmp_atomic_flag_64
<> *)NULL
, FALSE
,
281 &tasks_completed
USE_ITT_BUILD_ARG(itt_sync_obj
), 0);
283 this_thr
->th
.th_reap_state
= KMP_SAFE_TO_REAP
;
286 this_thr
->th
.th_reap_state
= KMP_SAFE_TO_REAP
;
289 if (TCR_4(__kmp_global
.g
.g_done
)) {
290 if (__kmp_global
.g
.g_abort
)
291 __kmp_abort_thread();
293 } else if (__kmp_tasking_mode
!= tskm_immediate_exec
&&
294 this_thr
->th
.th_reap_state
== KMP_SAFE_TO_REAP
) {
295 this_thr
->th
.th_reap_state
= KMP_NOT_SAFE_TO_REAP
;
297 } while (threads_pending
> 0);
299 if (reduce
) { // Perform reduction if needed
300 OMPT_REDUCTION_DECL(this_thr
, gtid
);
301 OMPT_REDUCTION_BEGIN
;
302 // Group leader reduces all threads in group
303 for (size_t thr
= group_start
; thr
< group_end
; thr
++) {
304 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
305 other_threads
[thr
]->th
.th_local
.reduce_data
);
310 // Set flag for next iteration
311 b
->flags
[my_next_iter
][tid
].stillNeed
= 1;
312 // Each thread uses a different cache line; resets stillNeed to 0 to
313 // indicate it has reached the barrier
314 b
->flags
[my_current_iter
][tid
].stillNeed
= 0;
316 do { // wait for all group leaders
318 for (size_t thr
= 0; thr
< nproc
; thr
+= b
->threads_per_group
) {
319 threads_pending
+= b
->flags
[my_current_iter
][thr
].stillNeed
;
321 // Execute tasks here
322 if (__kmp_tasking_mode
!= tskm_immediate_exec
) {
323 kmp_task_team_t
*task_team
= this_thr
->th
.th_task_team
;
324 if (task_team
!= NULL
) {
325 if (TCR_SYNC_4(task_team
->tt
.tt_active
)) {
326 if (KMP_TASKING_ENABLED(task_team
)) {
327 int tasks_completed
= FALSE
;
328 __kmp_atomic_execute_tasks_64(
329 this_thr
, gtid
, (kmp_atomic_flag_64
<> *)NULL
, FALSE
,
330 &tasks_completed
USE_ITT_BUILD_ARG(itt_sync_obj
), 0);
332 this_thr
->th
.th_reap_state
= KMP_SAFE_TO_REAP
;
335 this_thr
->th
.th_reap_state
= KMP_SAFE_TO_REAP
;
338 if (TCR_4(__kmp_global
.g
.g_done
)) {
339 if (__kmp_global
.g
.g_abort
)
340 __kmp_abort_thread();
342 } else if (__kmp_tasking_mode
!= tskm_immediate_exec
&&
343 this_thr
->th
.th_reap_state
== KMP_SAFE_TO_REAP
) {
344 this_thr
->th
.th_reap_state
= KMP_NOT_SAFE_TO_REAP
;
346 } while (threads_pending
> 0);
348 if (reduce
) { // Perform reduction if needed
349 if (KMP_MASTER_TID(tid
)) { // Master reduces over group leaders
350 OMPT_REDUCTION_DECL(this_thr
, gtid
);
351 OMPT_REDUCTION_BEGIN
;
352 for (size_t thr
= b
->threads_per_group
; thr
< nproc
;
353 thr
+= b
->threads_per_group
) {
354 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
355 other_threads
[thr
]->th
.th_local
.reduce_data
);
361 // Set flag for next iteration
362 b
->flags
[my_next_iter
][tid
].stillNeed
= 1;
363 // Each thread uses a different cache line; resets stillNeed to 0 to
364 // indicate it has reached the barrier
365 b
->flags
[my_current_iter
][tid
].stillNeed
= 0;
371 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372 gtid
, team
->t
.t_id
, tid
, bt
));
375 static void __kmp_dist_barrier_release(
376 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
377 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
378 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release
);
380 distributedBarrier
*b
;
381 kmp_bstate_t
*thr_bar
;
382 kmp_uint64 my_current_iter
, next_go
;
386 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
389 thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
391 if (!KMP_MASTER_TID(tid
)) {
392 // workers and non-master group leaders need to check their presence in team
394 if (this_thr
->th
.th_used_in_team
.load() != 1 &&
395 this_thr
->th
.th_used_in_team
.load() != 3) {
396 // Thread is not in use in a team. Wait on location in tid's thread
397 // struct. The 0 value tells anyone looking that this thread is spinning
398 // or sleeping until this location becomes 3 again; 3 is the transition
399 // state to get to 1 which is waiting on go and being in the team
400 kmp_flag_32
<false, false> my_flag(&(this_thr
->th
.th_used_in_team
), 3);
401 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr
->th
.th_used_in_team
), 2,
403 this_thr
->th
.th_used_in_team
.load() == 0) {
404 my_flag
.wait(this_thr
, true USE_ITT_BUILD_ARG(itt_sync_obj
));
406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
407 if ((__itt_sync_create_ptr
&& itt_sync_obj
== NULL
) || KMP_ITT_DEBUG
) {
408 // In fork barrier where we could not get the object reliably
410 __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
, 0, -1);
411 // Cancel wait on previous parallel region...
412 __kmp_itt_task_starting(itt_sync_obj
);
414 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
417 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
418 if (itt_sync_obj
!= NULL
)
419 // Call prepare as early as possible for "new" barrier
420 __kmp_itt_task_finished(itt_sync_obj
);
422 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
426 if (this_thr
->th
.th_used_in_team
.load() != 1 &&
427 this_thr
->th
.th_used_in_team
.load() != 3) // spurious wake-up?
429 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
432 // At this point, the thread thinks it is in use in a team, or in
433 // transition to be used in a team, but it might have reached this barrier
434 // before it was marked unused by the team. Unused threads are awoken and
435 // shifted to wait on local thread struct elsewhere. It also might reach
436 // this point by being picked up for use by a different team. Either way,
437 // we need to update the tid.
438 tid
= __kmp_tid_from_gtid(gtid
);
439 team
= this_thr
->th
.th_team
;
440 KMP_DEBUG_ASSERT(tid
>= 0);
441 KMP_DEBUG_ASSERT(team
);
443 my_current_iter
= b
->iter
[tid
].iter
;
444 next_go
= my_current_iter
+ distributedBarrier::MAX_ITERS
;
445 my_go_index
= tid
/ b
->threads_per_go
;
446 if (this_thr
->th
.th_used_in_team
.load() == 3) {
447 (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr
->th
.th_used_in_team
), 3,
450 // Check if go flag is set
451 if (b
->go
[my_go_index
].go
.load() != next_go
) {
452 // Wait on go flag on team
453 kmp_atomic_flag_64
<false, true> my_flag(
454 &(b
->go
[my_go_index
].go
), next_go
, &(b
->sleep
[tid
].sleep
));
455 my_flag
.wait(this_thr
, true USE_ITT_BUILD_ARG(itt_sync_obj
));
456 KMP_DEBUG_ASSERT(my_current_iter
== b
->iter
[tid
].iter
||
457 b
->iter
[tid
].iter
== 0);
458 KMP_DEBUG_ASSERT(b
->sleep
[tid
].sleep
== false);
461 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
463 // At this point, the thread's go location was set. This means the primary
464 // thread is safely in the barrier, and so this thread's data is
465 // up-to-date, but we should check again that this thread is really in
466 // use in the team, as it could have been woken up for the purpose of
467 // changing team size, or reaping threads at shutdown.
468 if (this_thr
->th
.th_used_in_team
.load() == 1)
472 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
475 group_leader
= ((tid
% b
->threads_per_group
) == 0);
477 // Tell all the threads in my group they can go!
478 for (size_t go_idx
= my_go_index
+ 1;
479 go_idx
< my_go_index
+ b
->gos_per_group
; go_idx
++) {
480 b
->go
[go_idx
].go
.store(next_go
);
482 // Fence added so that workers can see changes to go. sfence inadequate.
486 #if KMP_BARRIER_ICV_PUSH
487 if (propagate_icvs
) { // copy ICVs to final dest
488 __kmp_init_implicit_task(team
->t
.t_ident
, team
->t
.t_threads
[tid
], team
,
490 copy_icvs(&team
->t
.t_implicit_task_taskdata
[tid
].td_icvs
,
491 (kmp_internal_control_t
*)team
->t
.b
->team_icvs
);
492 copy_icvs(&thr_bar
->th_fixed_icvs
,
493 &team
->t
.t_implicit_task_taskdata
[tid
].td_icvs
);
496 if (__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
&& group_leader
) {
497 // This thread is now awake and participating in the barrier;
498 // wake up the other threads in the group
499 size_t nproc
= this_thr
->th
.th_team_nproc
;
500 size_t group_end
= tid
+ b
->threads_per_group
;
501 if (nproc
< group_end
)
503 __kmp_dist_barrier_wakeup(bt
, team
, tid
+ 1, group_end
, 1, tid
);
505 } else { // Primary thread
506 team
= this_thr
->th
.th_team
;
508 my_current_iter
= b
->iter
[tid
].iter
;
509 next_go
= my_current_iter
+ distributedBarrier::MAX_ITERS
;
510 #if KMP_BARRIER_ICV_PUSH
511 if (propagate_icvs
) {
512 // primary thread has ICVs in final destination; copy
513 copy_icvs(&thr_bar
->th_fixed_icvs
,
514 &team
->t
.t_implicit_task_taskdata
[tid
].td_icvs
);
517 // Tell all the group leaders they can go!
518 for (size_t go_idx
= 0; go_idx
< b
->num_gos
; go_idx
+= b
->gos_per_group
) {
519 b
->go
[go_idx
].go
.store(next_go
);
522 if (__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
) {
523 // Wake-up the group leaders
524 size_t nproc
= this_thr
->th
.th_team_nproc
;
525 __kmp_dist_barrier_wakeup(bt
, team
, tid
+ b
->threads_per_group
, nproc
,
526 b
->threads_per_group
, tid
);
529 // Tell all the threads in my group they can go!
530 for (size_t go_idx
= 1; go_idx
< b
->gos_per_group
; go_idx
++) {
531 b
->go
[go_idx
].go
.store(next_go
);
534 // Fence added so that workers can see changes to go. sfence inadequate.
537 if (__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
) {
538 // Wake-up the other threads in my group
539 size_t nproc
= this_thr
->th
.th_team_nproc
;
540 size_t group_end
= tid
+ b
->threads_per_group
;
541 if (nproc
< group_end
)
543 __kmp_dist_barrier_wakeup(bt
, team
, tid
+ 1, group_end
, 1, tid
);
546 // Update to next iteration
547 KMP_ASSERT(my_current_iter
== b
->iter
[tid
].iter
);
548 b
->iter
[tid
].iter
= (b
->iter
[tid
].iter
+ 1) % distributedBarrier::MAX_ITERS
;
551 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
552 gtid
, team
->t
.t_id
, tid
, bt
));
556 template <bool cancellable
= false>
557 static bool __kmp_linear_barrier_gather_template(
558 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
559 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
560 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather
);
561 kmp_team_t
*team
= this_thr
->th
.th_team
;
562 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
563 kmp_info_t
**other_threads
= team
->t
.t_threads
;
567 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
568 gtid
, team
->t
.t_id
, tid
, bt
));
569 KMP_DEBUG_ASSERT(this_thr
== other_threads
[this_thr
->th
.th_info
.ds
.ds_tid
]);
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572 // Barrier imbalance - save arrive time to the thread
573 if (__kmp_forkjoin_frames_mode
== 3 || __kmp_forkjoin_frames_mode
== 2) {
574 this_thr
->th
.th_bar_arrive_time
= this_thr
->th
.th_bar_min_time
=
575 __itt_get_timestamp();
578 // We now perform a linear reduction to signal that all of the threads have
580 if (!KMP_MASTER_TID(tid
)) {
582 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
583 "arrived(%p): %llu => %llu\n",
584 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(0, team
),
585 team
->t
.t_id
, 0, &thr_bar
->b_arrived
, thr_bar
->b_arrived
,
586 thr_bar
->b_arrived
+ KMP_BARRIER_STATE_BUMP
));
587 // Mark arrival to primary thread
588 /* After performing this write, a worker thread may not assume that the team
589 is valid any more - it could be deallocated by the primary thread at any
591 kmp_flag_64
<> flag(&thr_bar
->b_arrived
, other_threads
[0]);
594 kmp_balign_team_t
*team_bar
= &team
->t
.t_bar
[bt
];
595 int nproc
= this_thr
->th
.th_team_nproc
;
597 // Don't have to worry about sleep bit here or atomic since team setting
598 kmp_uint64 new_state
= team_bar
->b_arrived
+ KMP_BARRIER_STATE_BUMP
;
600 // Collect all the worker team member threads.
601 for (i
= 1; i
< nproc
; ++i
) {
603 // Prefetch next thread's arrived count
605 KMP_CACHE_PREFETCH(&other_threads
[i
+ 1]->th
.th_bar
[bt
].bb
.b_arrived
);
606 #endif /* KMP_CACHE_MANAGE */
607 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
608 "arrived(%p) == %llu\n",
609 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(i
, team
),
611 &other_threads
[i
]->th
.th_bar
[bt
].bb
.b_arrived
, new_state
));
613 // Wait for worker thread to arrive
615 kmp_flag_64
<true, false> flag(
616 &other_threads
[i
]->th
.th_bar
[bt
].bb
.b_arrived
, new_state
);
617 if (flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
)))
620 kmp_flag_64
<> flag(&other_threads
[i
]->th
.th_bar
[bt
].bb
.b_arrived
,
622 flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
625 // Barrier imbalance - write min of the thread time and the other thread
626 // time to the thread.
627 if (__kmp_forkjoin_frames_mode
== 2) {
628 this_thr
->th
.th_bar_min_time
= KMP_MIN(
629 this_thr
->th
.th_bar_min_time
, other_threads
[i
]->th
.th_bar_min_time
);
634 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
635 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(i
, team
),
637 OMPT_REDUCTION_DECL(this_thr
, gtid
);
638 OMPT_REDUCTION_BEGIN
;
639 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
640 other_threads
[i
]->th
.th_local
.reduce_data
);
644 // Don't have to worry about sleep bit here or atomic since team setting
645 team_bar
->b_arrived
= new_state
;
646 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
647 "arrived(%p) = %llu\n",
648 gtid
, team
->t
.t_id
, tid
, team
->t
.t_id
, &team_bar
->b_arrived
,
653 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
654 gtid
, team
->t
.t_id
, tid
, bt
));
658 template <bool cancellable
= false>
659 static bool __kmp_linear_barrier_release_template(
660 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
661 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
662 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release
);
663 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
666 if (KMP_MASTER_TID(tid
)) {
668 kmp_uint32 nproc
= this_thr
->th
.th_team_nproc
;
669 kmp_info_t
**other_threads
;
671 team
= __kmp_threads
[gtid
]->th
.th_team
;
672 KMP_DEBUG_ASSERT(team
!= NULL
);
673 other_threads
= team
->t
.t_threads
;
675 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
677 gtid
, team
->t
.t_id
, tid
, bt
));
680 #if KMP_BARRIER_ICV_PUSH
682 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy
);
683 if (propagate_icvs
) {
684 ngo_load(&team
->t
.t_implicit_task_taskdata
[0].td_icvs
);
685 for (i
= 1; i
< nproc
; ++i
) {
686 __kmp_init_implicit_task(team
->t
.t_ident
, team
->t
.t_threads
[i
],
688 ngo_store_icvs(&team
->t
.t_implicit_task_taskdata
[i
].td_icvs
,
689 &team
->t
.t_implicit_task_taskdata
[0].td_icvs
);
694 #endif // KMP_BARRIER_ICV_PUSH
696 // Now, release all of the worker threads
697 for (i
= 1; i
< nproc
; ++i
) {
699 // Prefetch next thread's go flag
701 KMP_CACHE_PREFETCH(&other_threads
[i
+ 1]->th
.th_bar
[bt
].bb
.b_go
);
702 #endif /* KMP_CACHE_MANAGE */
705 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
706 "go(%p): %u => %u\n",
707 gtid
, team
->t
.t_id
, tid
, other_threads
[i
]->th
.th_info
.ds
.ds_gtid
,
708 team
->t
.t_id
, i
, &other_threads
[i
]->th
.th_bar
[bt
].bb
.b_go
,
709 other_threads
[i
]->th
.th_bar
[bt
].bb
.b_go
,
710 other_threads
[i
]->th
.th_bar
[bt
].bb
.b_go
+ KMP_BARRIER_STATE_BUMP
));
711 kmp_flag_64
<> flag(&other_threads
[i
]->th
.th_bar
[bt
].bb
.b_go
,
716 } else { // Wait for the PRIMARY thread to release us
717 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
718 gtid
, &thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
));
720 kmp_flag_64
<true, false> flag(&thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
);
721 if (flag
.wait(this_thr
, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj
)))
724 kmp_flag_64
<> flag(&thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
);
725 flag
.wait(this_thr
, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj
));
727 #if USE_ITT_BUILD && USE_ITT_NOTIFY
728 if ((__itt_sync_create_ptr
&& itt_sync_obj
== NULL
) || KMP_ITT_DEBUG
) {
729 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
731 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
, 0, -1);
732 // Cancel wait on previous parallel region...
733 __kmp_itt_task_starting(itt_sync_obj
);
735 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
738 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
739 if (itt_sync_obj
!= NULL
)
740 // Call prepare as early as possible for "new" barrier
741 __kmp_itt_task_finished(itt_sync_obj
);
743 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
744 // Early exit for reaping threads releasing forkjoin barrier
745 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
747 // The worker thread may now assume that the team is valid.
749 tid
= __kmp_tid_from_gtid(gtid
);
750 team
= __kmp_threads
[gtid
]->th
.th_team
;
752 KMP_DEBUG_ASSERT(team
!= NULL
);
753 TCW_4(thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
);
755 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
756 gtid
, team
->t
.t_id
, tid
, &thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
));
757 KMP_MB(); // Flush all pending memory write invalidates.
761 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
762 gtid
, team
->t
.t_id
, tid
, bt
));
766 static void __kmp_linear_barrier_gather(
767 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
768 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
769 __kmp_linear_barrier_gather_template
<false>(
770 bt
, this_thr
, gtid
, tid
, reduce
USE_ITT_BUILD_ARG(itt_sync_obj
));
773 static bool __kmp_linear_barrier_gather_cancellable(
774 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
775 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
776 return __kmp_linear_barrier_gather_template
<true>(
777 bt
, this_thr
, gtid
, tid
, reduce
USE_ITT_BUILD_ARG(itt_sync_obj
));
780 static void __kmp_linear_barrier_release(
781 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
782 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
783 __kmp_linear_barrier_release_template
<false>(
784 bt
, this_thr
, gtid
, tid
, propagate_icvs
USE_ITT_BUILD_ARG(itt_sync_obj
));
787 static bool __kmp_linear_barrier_release_cancellable(
788 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
789 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
790 return __kmp_linear_barrier_release_template
<true>(
791 bt
, this_thr
, gtid
, tid
, propagate_icvs
USE_ITT_BUILD_ARG(itt_sync_obj
));
795 static void __kmp_tree_barrier_gather(
796 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
797 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
798 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather
);
799 kmp_team_t
*team
= this_thr
->th
.th_team
;
800 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
801 kmp_info_t
**other_threads
= team
->t
.t_threads
;
802 kmp_uint32 nproc
= this_thr
->th
.th_team_nproc
;
803 kmp_uint32 branch_bits
= __kmp_barrier_gather_branch_bits
[bt
];
804 kmp_uint32 branch_factor
= 1 << branch_bits
;
806 kmp_uint32 child_tid
;
807 kmp_uint64 new_state
= 0;
810 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
811 gtid
, team
->t
.t_id
, tid
, bt
));
812 KMP_DEBUG_ASSERT(this_thr
== other_threads
[this_thr
->th
.th_info
.ds
.ds_tid
]);
814 #if USE_ITT_BUILD && USE_ITT_NOTIFY
815 // Barrier imbalance - save arrive time to the thread
816 if (__kmp_forkjoin_frames_mode
== 3 || __kmp_forkjoin_frames_mode
== 2) {
817 this_thr
->th
.th_bar_arrive_time
= this_thr
->th
.th_bar_min_time
=
818 __itt_get_timestamp();
821 // Perform tree gather to wait until all threads have arrived; reduce any
822 // required data as we go
823 child_tid
= (tid
<< branch_bits
) + 1;
824 if (child_tid
< nproc
) {
825 // Parent threads wait for all their children to arrive
826 new_state
= team
->t
.t_bar
[bt
].b_arrived
+ KMP_BARRIER_STATE_BUMP
;
829 kmp_info_t
*child_thr
= other_threads
[child_tid
];
830 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
832 // Prefetch next thread's arrived count
833 if (child
+ 1 <= branch_factor
&& child_tid
+ 1 < nproc
)
835 &other_threads
[child_tid
+ 1]->th
.th_bar
[bt
].bb
.b_arrived
);
836 #endif /* KMP_CACHE_MANAGE */
838 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
839 "arrived(%p) == %llu\n",
840 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
841 team
->t
.t_id
, child_tid
, &child_bar
->b_arrived
, new_state
));
842 // Wait for child to arrive
843 kmp_flag_64
<> flag(&child_bar
->b_arrived
, new_state
);
844 flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
845 #if USE_ITT_BUILD && USE_ITT_NOTIFY
846 // Barrier imbalance - write min of the thread time and a child time to
848 if (__kmp_forkjoin_frames_mode
== 2) {
849 this_thr
->th
.th_bar_min_time
= KMP_MIN(this_thr
->th
.th_bar_min_time
,
850 child_thr
->th
.th_bar_min_time
);
855 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
856 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
857 team
->t
.t_id
, child_tid
));
858 OMPT_REDUCTION_DECL(this_thr
, gtid
);
859 OMPT_REDUCTION_BEGIN
;
860 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
861 child_thr
->th
.th_local
.reduce_data
);
866 } while (child
<= branch_factor
&& child_tid
< nproc
);
869 if (!KMP_MASTER_TID(tid
)) { // Worker threads
870 kmp_int32 parent_tid
= (tid
- 1) >> branch_bits
;
873 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
874 "arrived(%p): %llu => %llu\n",
875 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(parent_tid
, team
),
876 team
->t
.t_id
, parent_tid
, &thr_bar
->b_arrived
, thr_bar
->b_arrived
,
877 thr_bar
->b_arrived
+ KMP_BARRIER_STATE_BUMP
));
879 // Mark arrival to parent thread
880 /* After performing this write, a worker thread may not assume that the team
881 is valid any more - it could be deallocated by the primary thread at any
883 kmp_flag_64
<> flag(&thr_bar
->b_arrived
, other_threads
[parent_tid
]);
886 // Need to update the team arrived pointer if we are the primary thread
887 if (nproc
> 1) // New value was already computed above
888 team
->t
.t_bar
[bt
].b_arrived
= new_state
;
890 team
->t
.t_bar
[bt
].b_arrived
+= KMP_BARRIER_STATE_BUMP
;
891 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
892 "arrived(%p) = %llu\n",
893 gtid
, team
->t
.t_id
, tid
, team
->t
.t_id
,
894 &team
->t
.t_bar
[bt
].b_arrived
, team
->t
.t_bar
[bt
].b_arrived
));
897 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
898 gtid
, team
->t
.t_id
, tid
, bt
));
901 static void __kmp_tree_barrier_release(
902 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
903 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
904 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release
);
906 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
908 kmp_uint32 branch_bits
= __kmp_barrier_release_branch_bits
[bt
];
909 kmp_uint32 branch_factor
= 1 << branch_bits
;
911 kmp_uint32 child_tid
;
913 // Perform a tree release for all of the threads that have been gathered
915 tid
)) { // Handle fork barrier workers who aren't part of a team yet
916 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid
,
917 &thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
));
918 // Wait for parent thread to release us
919 kmp_flag_64
<> flag(&thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
);
920 flag
.wait(this_thr
, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj
));
921 #if USE_ITT_BUILD && USE_ITT_NOTIFY
922 if ((__itt_sync_create_ptr
&& itt_sync_obj
== NULL
) || KMP_ITT_DEBUG
) {
923 // In fork barrier where we could not get the object reliably (or
924 // ITTNOTIFY is disabled)
925 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
, 0, -1);
926 // Cancel wait on previous parallel region...
927 __kmp_itt_task_starting(itt_sync_obj
);
929 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
932 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
933 if (itt_sync_obj
!= NULL
)
934 // Call prepare as early as possible for "new" barrier
935 __kmp_itt_task_finished(itt_sync_obj
);
937 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
938 // Early exit for reaping threads releasing forkjoin barrier
939 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
942 // The worker thread may now assume that the team is valid.
943 team
= __kmp_threads
[gtid
]->th
.th_team
;
944 KMP_DEBUG_ASSERT(team
!= NULL
);
945 tid
= __kmp_tid_from_gtid(gtid
);
947 TCW_4(thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
);
949 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid
,
950 team
->t
.t_id
, tid
, &thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
));
951 KMP_MB(); // Flush all pending memory write invalidates.
953 team
= __kmp_threads
[gtid
]->th
.th_team
;
954 KMP_DEBUG_ASSERT(team
!= NULL
);
955 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
957 gtid
, team
->t
.t_id
, tid
, bt
));
959 nproc
= this_thr
->th
.th_team_nproc
;
960 child_tid
= (tid
<< branch_bits
) + 1;
962 if (child_tid
< nproc
) {
963 kmp_info_t
**other_threads
= team
->t
.t_threads
;
965 // Parent threads release all their children
967 kmp_info_t
*child_thr
= other_threads
[child_tid
];
968 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
970 // Prefetch next thread's go count
971 if (child
+ 1 <= branch_factor
&& child_tid
+ 1 < nproc
)
973 &other_threads
[child_tid
+ 1]->th
.th_bar
[bt
].bb
.b_go
);
974 #endif /* KMP_CACHE_MANAGE */
976 #if KMP_BARRIER_ICV_PUSH
978 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy
);
979 if (propagate_icvs
) {
980 __kmp_init_implicit_task(team
->t
.t_ident
,
981 team
->t
.t_threads
[child_tid
], team
,
983 copy_icvs(&team
->t
.t_implicit_task_taskdata
[child_tid
].td_icvs
,
984 &team
->t
.t_implicit_task_taskdata
[0].td_icvs
);
987 #endif // KMP_BARRIER_ICV_PUSH
989 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
990 "go(%p): %u => %u\n",
991 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
992 team
->t
.t_id
, child_tid
, &child_bar
->b_go
, child_bar
->b_go
,
993 child_bar
->b_go
+ KMP_BARRIER_STATE_BUMP
));
994 // Release child from barrier
995 kmp_flag_64
<> flag(&child_bar
->b_go
, child_thr
);
999 } while (child
<= branch_factor
&& child_tid
< nproc
);
1002 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1003 gtid
, team
->t
.t_id
, tid
, bt
));
1007 static void __kmp_hyper_barrier_gather(
1008 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
1009 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
1010 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather
);
1011 kmp_team_t
*team
= this_thr
->th
.th_team
;
1012 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
1013 kmp_info_t
**other_threads
= team
->t
.t_threads
;
1014 kmp_uint64 new_state
= KMP_BARRIER_UNUSED_STATE
;
1015 kmp_uint32 num_threads
= this_thr
->th
.th_team_nproc
;
1016 kmp_uint32 branch_bits
= __kmp_barrier_gather_branch_bits
[bt
];
1017 kmp_uint32 branch_factor
= 1 << branch_bits
;
1023 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1024 gtid
, team
->t
.t_id
, tid
, bt
));
1025 KMP_DEBUG_ASSERT(this_thr
== other_threads
[this_thr
->th
.th_info
.ds
.ds_tid
]);
1027 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1028 // Barrier imbalance - save arrive time to the thread
1029 if (__kmp_forkjoin_frames_mode
== 3 || __kmp_forkjoin_frames_mode
== 2) {
1030 this_thr
->th
.th_bar_arrive_time
= this_thr
->th
.th_bar_min_time
=
1031 __itt_get_timestamp();
1034 /* Perform a hypercube-embedded tree gather to wait until all of the threads
1035 have arrived, and reduce any required data as we go. */
1036 kmp_flag_64
<> p_flag(&thr_bar
->b_arrived
);
1037 for (level
= 0, offset
= 1; offset
< num_threads
;
1038 level
+= branch_bits
, offset
<<= branch_bits
) {
1040 kmp_uint32 child_tid
;
1042 if (((tid
>> level
) & (branch_factor
- 1)) != 0) {
1043 kmp_int32 parent_tid
= tid
& ~((1 << (level
+ branch_bits
)) - 1);
1045 KMP_MB(); // Synchronize parent and child threads.
1047 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1048 "arrived(%p): %llu => %llu\n",
1049 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(parent_tid
, team
),
1050 team
->t
.t_id
, parent_tid
, &thr_bar
->b_arrived
,
1052 thr_bar
->b_arrived
+ KMP_BARRIER_STATE_BUMP
));
1053 // Mark arrival to parent thread
1054 /* After performing this write (in the last iteration of the enclosing for
1055 loop), a worker thread may not assume that the team is valid any more
1056 - it could be deallocated by the primary thread at any time. */
1057 p_flag
.set_waiter(other_threads
[parent_tid
]);
1062 // Parent threads wait for children to arrive
1063 if (new_state
== KMP_BARRIER_UNUSED_STATE
)
1064 new_state
= team
->t
.t_bar
[bt
].b_arrived
+ KMP_BARRIER_STATE_BUMP
;
1065 for (child
= 1, child_tid
= tid
+ (1 << level
);
1066 child
< branch_factor
&& child_tid
< num_threads
;
1067 child
++, child_tid
+= (1 << level
)) {
1068 kmp_info_t
*child_thr
= other_threads
[child_tid
];
1069 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
1070 #if KMP_CACHE_MANAGE
1071 kmp_uint32 next_child_tid
= child_tid
+ (1 << level
);
1072 // Prefetch next thread's arrived count
1073 if (child
+ 1 < branch_factor
&& next_child_tid
< num_threads
)
1075 &other_threads
[next_child_tid
]->th
.th_bar
[bt
].bb
.b_arrived
);
1076 #endif /* KMP_CACHE_MANAGE */
1078 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1079 "arrived(%p) == %llu\n",
1080 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
1081 team
->t
.t_id
, child_tid
, &child_bar
->b_arrived
, new_state
));
1082 // Wait for child to arrive
1083 kmp_flag_64
<> c_flag(&child_bar
->b_arrived
, new_state
);
1084 c_flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
1085 KMP_MB(); // Synchronize parent and child threads.
1086 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1087 // Barrier imbalance - write min of the thread time and a child time to
1089 if (__kmp_forkjoin_frames_mode
== 2) {
1090 this_thr
->th
.th_bar_min_time
= KMP_MIN(this_thr
->th
.th_bar_min_time
,
1091 child_thr
->th
.th_bar_min_time
);
1096 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1097 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
1098 team
->t
.t_id
, child_tid
));
1099 OMPT_REDUCTION_DECL(this_thr
, gtid
);
1100 OMPT_REDUCTION_BEGIN
;
1101 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
1102 child_thr
->th
.th_local
.reduce_data
);
1108 if (KMP_MASTER_TID(tid
)) {
1109 // Need to update the team arrived pointer if we are the primary thread
1110 if (new_state
== KMP_BARRIER_UNUSED_STATE
)
1111 team
->t
.t_bar
[bt
].b_arrived
+= KMP_BARRIER_STATE_BUMP
;
1113 team
->t
.t_bar
[bt
].b_arrived
= new_state
;
1114 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1115 "arrived(%p) = %llu\n",
1116 gtid
, team
->t
.t_id
, tid
, team
->t
.t_id
,
1117 &team
->t
.t_bar
[bt
].b_arrived
, team
->t
.t_bar
[bt
].b_arrived
));
1120 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1121 gtid
, team
->t
.t_id
, tid
, bt
));
1124 // The reverse versions seem to beat the forward versions overall
1125 #define KMP_REVERSE_HYPER_BAR
1126 static void __kmp_hyper_barrier_release(
1127 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
1128 int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
1129 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release
);
1131 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
1132 kmp_info_t
**other_threads
;
1133 kmp_uint32 num_threads
;
1134 kmp_uint32 branch_bits
= __kmp_barrier_release_branch_bits
[bt
];
1135 kmp_uint32 branch_factor
= 1 << branch_bits
;
1137 kmp_uint32 child_tid
;
1141 /* Perform a hypercube-embedded tree release for all of the threads that have
1142 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1143 are released in the reverse order of the corresponding gather, otherwise
1144 threads are released in the same order. */
1145 if (KMP_MASTER_TID(tid
)) { // primary thread
1146 team
= __kmp_threads
[gtid
]->th
.th_team
;
1147 KMP_DEBUG_ASSERT(team
!= NULL
);
1148 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1149 "barrier type %d\n",
1150 gtid
, team
->t
.t_id
, tid
, bt
));
1151 #if KMP_BARRIER_ICV_PUSH
1152 if (propagate_icvs
) { // primary already has ICVs in final destination; copy
1153 copy_icvs(&thr_bar
->th_fixed_icvs
,
1154 &team
->t
.t_implicit_task_taskdata
[tid
].td_icvs
);
1157 } else { // Handle fork barrier workers who aren't part of a team yet
1158 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid
,
1159 &thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
));
1160 // Wait for parent thread to release us
1161 kmp_flag_64
<> flag(&thr_bar
->b_go
, KMP_BARRIER_STATE_BUMP
);
1162 flag
.wait(this_thr
, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj
));
1163 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1164 if ((__itt_sync_create_ptr
&& itt_sync_obj
== NULL
) || KMP_ITT_DEBUG
) {
1165 // In fork barrier where we could not get the object reliably
1166 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
, 0, -1);
1167 // Cancel wait on previous parallel region...
1168 __kmp_itt_task_starting(itt_sync_obj
);
1170 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
1173 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
1174 if (itt_sync_obj
!= NULL
)
1175 // Call prepare as early as possible for "new" barrier
1176 __kmp_itt_task_finished(itt_sync_obj
);
1178 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1179 // Early exit for reaping threads releasing forkjoin barrier
1180 if (bt
== bs_forkjoin_barrier
&& TCR_4(__kmp_global
.g
.g_done
))
1183 // The worker thread may now assume that the team is valid.
1184 team
= __kmp_threads
[gtid
]->th
.th_team
;
1185 KMP_DEBUG_ASSERT(team
!= NULL
);
1186 tid
= __kmp_tid_from_gtid(gtid
);
1188 TCW_4(thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
);
1190 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1191 gtid
, team
->t
.t_id
, tid
, &thr_bar
->b_go
, KMP_INIT_BARRIER_STATE
));
1192 KMP_MB(); // Flush all pending memory write invalidates.
1194 num_threads
= this_thr
->th
.th_team_nproc
;
1195 other_threads
= team
->t
.t_threads
;
1197 #ifdef KMP_REVERSE_HYPER_BAR
1198 // Count up to correct level for parent
1199 for (level
= 0, offset
= 1;
1200 offset
< num_threads
&& (((tid
>> level
) & (branch_factor
- 1)) == 0);
1201 level
+= branch_bits
, offset
<<= branch_bits
)
1204 // Now go down from there
1205 for (level
-= branch_bits
, offset
>>= branch_bits
; offset
!= 0;
1206 level
-= branch_bits
, offset
>>= branch_bits
)
1208 // Go down the tree, level by level
1209 for (level
= 0, offset
= 1; offset
< num_threads
;
1210 level
+= branch_bits
, offset
<<= branch_bits
)
1211 #endif // KMP_REVERSE_HYPER_BAR
1213 #ifdef KMP_REVERSE_HYPER_BAR
1214 /* Now go in reverse order through the children, highest to lowest.
1215 Initial setting of child is conservative here. */
1216 child
= num_threads
>> ((level
== 0) ? level
: level
- 1);
1217 for (child
= (child
< branch_factor
- 1) ? child
: branch_factor
- 1,
1218 child_tid
= tid
+ (child
<< level
);
1219 child
>= 1; child
--, child_tid
-= (1 << level
))
1221 if (((tid
>> level
) & (branch_factor
- 1)) != 0)
1222 // No need to go lower than this, since this is the level parent would be
1225 // Iterate through children on this level of the tree
1226 for (child
= 1, child_tid
= tid
+ (1 << level
);
1227 child
< branch_factor
&& child_tid
< num_threads
;
1228 child
++, child_tid
+= (1 << level
))
1229 #endif // KMP_REVERSE_HYPER_BAR
1231 if (child_tid
>= num_threads
)
1232 continue; // Child doesn't exist so keep going
1234 kmp_info_t
*child_thr
= other_threads
[child_tid
];
1235 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
1236 #if KMP_CACHE_MANAGE
1237 kmp_uint32 next_child_tid
= child_tid
- (1 << level
);
1238 // Prefetch next thread's go count
1239 #ifdef KMP_REVERSE_HYPER_BAR
1240 if (child
- 1 >= 1 && next_child_tid
< num_threads
)
1242 if (child
+ 1 < branch_factor
&& next_child_tid
< num_threads
)
1243 #endif // KMP_REVERSE_HYPER_BAR
1245 &other_threads
[next_child_tid
]->th
.th_bar
[bt
].bb
.b_go
);
1246 #endif /* KMP_CACHE_MANAGE */
1248 #if KMP_BARRIER_ICV_PUSH
1249 if (propagate_icvs
) // push my fixed ICVs to my child
1250 copy_icvs(&child_bar
->th_fixed_icvs
, &thr_bar
->th_fixed_icvs
);
1251 #endif // KMP_BARRIER_ICV_PUSH
1255 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1256 "go(%p): %u => %u\n",
1257 gtid
, team
->t
.t_id
, tid
, __kmp_gtid_from_tid(child_tid
, team
),
1258 team
->t
.t_id
, child_tid
, &child_bar
->b_go
, child_bar
->b_go
,
1259 child_bar
->b_go
+ KMP_BARRIER_STATE_BUMP
));
1260 // Release child from barrier
1261 kmp_flag_64
<> flag(&child_bar
->b_go
, child_thr
);
1266 #if KMP_BARRIER_ICV_PUSH
1267 if (propagate_icvs
&&
1268 !KMP_MASTER_TID(tid
)) { // copy ICVs locally to final dest
1269 __kmp_init_implicit_task(team
->t
.t_ident
, team
->t
.t_threads
[tid
], team
, tid
,
1271 copy_icvs(&team
->t
.t_implicit_task_taskdata
[tid
].td_icvs
,
1272 &thr_bar
->th_fixed_icvs
);
1277 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1278 gtid
, team
->t
.t_id
, tid
, bt
));
1281 // Hierarchical Barrier
1283 // Initialize thread barrier data
1284 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1285 Performs the minimum amount of initialization required based on how the team
1286 has changed. Returns true if leaf children will require both on-core and
1287 traditional wake-up mechanisms. For example, if the team size increases,
1288 threads already in the team will respond to on-core wakeup on their parent
1289 thread, but threads newly added to the team will only be listening on the
1290 their local b_go. */
1291 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt
,
1292 kmp_bstate_t
*thr_bar
,
1293 kmp_uint32 nproc
, int gtid
,
1294 int tid
, kmp_team_t
*team
) {
1295 // Checks to determine if (re-)initialization is needed
1296 bool uninitialized
= thr_bar
->team
== NULL
;
1297 bool team_changed
= team
!= thr_bar
->team
;
1298 bool team_sz_changed
= nproc
!= thr_bar
->nproc
;
1299 bool tid_changed
= tid
!= thr_bar
->old_tid
;
1300 bool retval
= false;
1302 if (uninitialized
|| team_sz_changed
) {
1303 __kmp_get_hierarchy(nproc
, thr_bar
);
1306 if (uninitialized
|| team_sz_changed
|| tid_changed
) {
1307 thr_bar
->my_level
= thr_bar
->depth
- 1; // default for primary thread
1308 thr_bar
->parent_tid
= -1; // default for primary thread
1309 if (!KMP_MASTER_TID(tid
)) {
1310 // if not primary thread, find parent thread in hierarchy
1312 while (d
< thr_bar
->depth
) { // find parent based on level of thread in
1313 // hierarchy, and note level
1315 if (d
== thr_bar
->depth
- 2) { // reached level right below the primary
1316 thr_bar
->parent_tid
= 0;
1317 thr_bar
->my_level
= d
;
1319 } else if ((rem
= tid
% thr_bar
->skip_per_level
[d
+ 1]) != 0) {
1320 // TODO: can we make the above op faster?
1321 // thread is not a subtree root at next level, so this is max
1322 thr_bar
->parent_tid
= tid
- rem
;
1323 thr_bar
->my_level
= d
;
1329 __kmp_type_convert(7 - ((tid
- thr_bar
->parent_tid
) /
1330 (thr_bar
->skip_per_level
[thr_bar
->my_level
])),
1331 &(thr_bar
->offset
));
1332 thr_bar
->old_tid
= tid
;
1333 thr_bar
->wait_flag
= KMP_BARRIER_NOT_WAITING
;
1334 thr_bar
->team
= team
;
1335 thr_bar
->parent_bar
=
1336 &team
->t
.t_threads
[thr_bar
->parent_tid
]->th
.th_bar
[bt
].bb
;
1338 if (uninitialized
|| team_changed
|| tid_changed
) {
1339 thr_bar
->team
= team
;
1340 thr_bar
->parent_bar
=
1341 &team
->t
.t_threads
[thr_bar
->parent_tid
]->th
.th_bar
[bt
].bb
;
1344 if (uninitialized
|| team_sz_changed
|| tid_changed
) {
1345 thr_bar
->nproc
= nproc
;
1346 thr_bar
->leaf_kids
= thr_bar
->base_leaf_kids
;
1347 if (thr_bar
->my_level
== 0)
1348 thr_bar
->leaf_kids
= 0;
1349 if (thr_bar
->leaf_kids
&& (kmp_uint32
)tid
+ thr_bar
->leaf_kids
+ 1 > nproc
)
1350 __kmp_type_convert(nproc
- tid
- 1, &(thr_bar
->leaf_kids
));
1351 thr_bar
->leaf_state
= 0;
1352 for (int i
= 0; i
< thr_bar
->leaf_kids
; ++i
)
1353 ((char *)&(thr_bar
->leaf_state
))[7 - i
] = 1;
1358 static void __kmp_hierarchical_barrier_gather(
1359 enum barrier_type bt
, kmp_info_t
*this_thr
, int gtid
, int tid
,
1360 void (*reduce
)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj
)) {
1361 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather
);
1362 kmp_team_t
*team
= this_thr
->th
.th_team
;
1363 kmp_bstate_t
*thr_bar
= &this_thr
->th
.th_bar
[bt
].bb
;
1364 kmp_uint32 nproc
= this_thr
->th
.th_team_nproc
;
1365 kmp_info_t
**other_threads
= team
->t
.t_threads
;
1366 kmp_uint64 new_state
= 0;
1368 int level
= team
->t
.t_level
;
1369 if (other_threads
[0]
1370 ->th
.th_teams_microtask
) // are we inside the teams construct?
1371 if (this_thr
->th
.th_teams_size
.nteams
> 1)
1372 ++level
; // level was not increased in teams construct for team_of_masters
1374 thr_bar
->use_oncore_barrier
= 1;
1376 thr_bar
->use_oncore_barrier
= 0; // Do not use oncore barrier when nested
1378 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1379 "barrier type %d\n",
1380 gtid
, team
->t
.t_id
, tid
, bt
));
1381 KMP_DEBUG_ASSERT(this_thr
== other_threads
[this_thr
->th
.th_info
.ds
.ds_tid
]);
1383 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1384 // Barrier imbalance - save arrive time to the thread
1385 if (__kmp_forkjoin_frames_mode
== 3 || __kmp_forkjoin_frames_mode
== 2) {
1386 this_thr
->th
.th_bar_arrive_time
= __itt_get_timestamp();
1390 (void)__kmp_init_hierarchical_barrier_thread(bt
, thr_bar
, nproc
, gtid
, tid
,
1393 if (thr_bar
->my_level
) { // not a leaf (my_level==0 means leaf)
1394 kmp_int32 child_tid
;
1396 (kmp_uint64
)team
->t
.t_bar
[bt
].b_arrived
+ KMP_BARRIER_STATE_BUMP
;
1397 if (__kmp_dflt_blocktime
== KMP_MAX_BLOCKTIME
&&
1398 thr_bar
->use_oncore_barrier
) {
1399 if (thr_bar
->leaf_kids
) {
1400 // First, wait for leaf children to check-in on my b_arrived flag
1401 kmp_uint64 leaf_state
=
1403 ? thr_bar
->b_arrived
| thr_bar
->leaf_state
1404 : team
->t
.t_bar
[bt
].b_arrived
| thr_bar
->leaf_state
;
1405 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1407 gtid
, team
->t
.t_id
, tid
));
1408 kmp_flag_64
<> flag(&thr_bar
->b_arrived
, leaf_state
);
1409 flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
1411 OMPT_REDUCTION_DECL(this_thr
, gtid
);
1412 OMPT_REDUCTION_BEGIN
;
1413 for (child_tid
= tid
+ 1; child_tid
<= tid
+ thr_bar
->leaf_kids
;
1415 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1417 gtid
, team
->t
.t_id
, tid
,
1418 __kmp_gtid_from_tid(child_tid
, team
), team
->t
.t_id
,
1420 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
1421 other_threads
[child_tid
]->th
.th_local
.reduce_data
);
1425 // clear leaf_state bits
1426 KMP_TEST_THEN_AND64(&thr_bar
->b_arrived
, ~(thr_bar
->leaf_state
));
1428 // Next, wait for higher level children on each child's b_arrived flag
1429 for (kmp_uint32 d
= 1; d
< thr_bar
->my_level
;
1430 ++d
) { // gather lowest level threads first, but skip 0
1431 kmp_uint32 last
= tid
+ thr_bar
->skip_per_level
[d
+ 1],
1432 skip
= thr_bar
->skip_per_level
[d
];
1435 for (child_tid
= tid
+ skip
; child_tid
< (int)last
; child_tid
+= skip
) {
1436 kmp_info_t
*child_thr
= other_threads
[child_tid
];
1437 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
1438 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1440 "arrived(%p) == %llu\n",
1441 gtid
, team
->t
.t_id
, tid
,
1442 __kmp_gtid_from_tid(child_tid
, team
), team
->t
.t_id
,
1443 child_tid
, &child_bar
->b_arrived
, new_state
));
1444 kmp_flag_64
<> flag(&child_bar
->b_arrived
, new_state
);
1445 flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
1447 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1449 gtid
, team
->t
.t_id
, tid
,
1450 __kmp_gtid_from_tid(child_tid
, team
), team
->t
.t_id
,
1452 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
1453 child_thr
->th
.th_local
.reduce_data
);
1457 } else { // Blocktime is not infinite
1458 for (kmp_uint32 d
= 0; d
< thr_bar
->my_level
;
1459 ++d
) { // Gather lowest level threads first
1460 kmp_uint32 last
= tid
+ thr_bar
->skip_per_level
[d
+ 1],
1461 skip
= thr_bar
->skip_per_level
[d
];
1464 for (child_tid
= tid
+ skip
; child_tid
< (int)last
; child_tid
+= skip
) {
1465 kmp_info_t
*child_thr
= other_threads
[child_tid
];
1466 kmp_bstate_t
*child_bar
= &child_thr
->th
.th_bar
[bt
].bb
;
1467 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1469 "arrived(%p) == %llu\n",
1470 gtid
, team
->t
.t_id
, tid
,
1471 __kmp_gtid_from_tid(child_tid
, team
), team
->t
.t_id
,
1472 child_tid
, &child_bar
->b_arrived
, new_state
));
1473 kmp_flag_64
<> flag(&child_bar
->b_arrived
, new_state
);
1474 flag
.wait(this_thr
, FALSE
USE_ITT_BUILD_ARG(itt_sync_obj
));
1476 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1478 gtid
, team
->t
.t_id
, tid
,
1479 __kmp_gtid_from_tid(child_tid
, team
), team
->t
.t_id
,
1481 (*reduce
)(this_thr
->th
.th_local
.reduce_data
,
1482 child_thr
->th
.th_local
.reduce_data
);
1488 // All subordinates are gathered; now release parent if not primary thread
1490 if (!KMP_MASTER_TID(tid
)) { // worker threads release parent in hierarchy
1491 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1492 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1493 gtid
, team
->t
.t_id
, tid
,
1494 __kmp_gtid_from_tid(thr_bar
->parent_tid
, team
), team
->t
.t_id
,
1495 thr_bar
->parent_tid
, &thr_bar
->b_arrived
, thr_bar
->b_arrived
,
1496 thr_bar
->b_arrived
+ KMP_BARRIER_STATE_BUMP
));
1497 /* Mark arrival to parent: After performing this write, a worker thread may
1498 not assume that the team is valid any more - it could be deallocated by
1499 the primary thread at any time. */
1500 if (thr_bar
->my_level
|| __kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
||
1501 !thr_bar
->use_oncore_barrier
) { // Parent is waiting on my b_arrived
1503 kmp_flag_64
<> flag(&thr_bar
->b_arrived
,
1504 other_threads
[thr_bar
->parent_tid
]);
1507 // Leaf does special release on "offset" bits of parent's b_arrived flag
1508 thr_bar
->b_arrived
= team
->t
.t_bar
[bt
].b_arrived
+ KMP_BARRIER_STATE_BUMP
;
1509 kmp_flag_oncore
flag(&thr_bar
->parent_bar
->b_arrived
,
1510 thr_bar
->offset
+ 1);
1511 flag
.set_waiter(other_threads
[thr_bar
->parent_tid
]);
1514 } else { // Primary thread needs to update the team's b_arrived value
1515 team
->t
.t_bar
[bt
].b_arrived
= new_state
;
1516 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1517 "arrived(%p) = %llu\n",
1518 gtid
, team
->t
.t_id
, tid
, team
->t
.t_id
,
1519 &team
->t
.t_bar
[bt
].b_arrived
, team
->t
.t_bar
[bt
].b_arrived
));
1521 // Is the team access below unsafe or just technically invalid?
1522 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1523 "barrier type %d\n",
1524 gtid
, team
->t
.t_id
, tid
, bt
));
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing IVCs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
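// Sketch of the release fan-out implemented above: with infinite blocktime and
// the on-core barrier, the primary thread does a flat NGO store of the ICV
// cache line (b_go rides in its last 8 bytes) to every non-leaf child, and
// leaves are woken either individually through kmp_flag_64 or all at once by
// OR-ing leaf_state into the releasing thread's own b_go. Otherwise the
// release walks the hierarchy top-down using the skip_per_level[] strides.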
// End of Barrier Algorithms

// type traits for cancellable value
// if cancellable is true, then is_cancellable is a normal boolean variable
// if cancellable is false, then is_cancellable is a compile time constant
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
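// Illustrative use of the traits above (not additional runtime code): with
// is_cancellable<true>, `cancelled = some_flag;` stores the flag and a later
// `if (cancelled)` reads it back, while with is_cancellable<false> the
// assignment is a no-op and `if (cancelled)` folds to `if (false)` at compile
// time, so the non-cancellable barrier path carries no extra state.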
// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier
   When cancellable = false,
   Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true
   Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
    switch (barrier_kind) {
    case ompt_sync_region_barrier_explicit:
      ompt_thr_info->state = ompt_state_wait_barrier_explicit;
      break;
    case ompt_sync_region_barrier_implicit_workshare:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
      break;
    case ompt_sync_region_barrier_implicit_parallel:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
      break;
    case ompt_sync_region_barrier_teams:
      ompt_thr_info->state = ompt_state_wait_barrier_teams;
      break;
    case ompt_sync_region_barrier_implementation:
    default:
      ompt_thr_info->state = ompt_state_wait_barrier_implementation;
    }
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    }
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif
      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_dist_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif
        if (this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                TRUE) {
          __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
          __kmp_task_team_setup(this_thr, team);
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}
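// __kmp_barrier_template is instantiated two ways: with cancellable=false for
// the ordinary entry point below (the return value then distinguishes primary
// thread from workers) and with cancellable=true for the GOMP cancellation
// path, which always uses the linear cancellable gather/release variants and
// returns whether the barrier was cancelled.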
// Returns 0 if primary thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
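// Typical call path (illustrative only, not additional runtime code): an
// explicit `#pragma omp barrier` inside a parallel region is typically lowered
// by the compiler to a runtime entry point that ends up calling
// __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); the actual
// gather/release algorithm used is whatever __kmp_barrier_gather_pattern[bt]
// and __kmp_barrier_release_pattern[bt] were configured to at startup.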
#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Primary thread does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif
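// Note: on the cancelled path above, workers subtract KMP_BARRIER_STATE_BUMP
// from their plain-barrier b_arrived flag so that the next, non-cancelled
// barrier still sees a consistent arrival count; the primary thread has
// nothing to revert.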
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      }
    }
  }
}
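// __kmp_end_split_barrier performs only the release half of a barrier: it is
// used to finish a split (reduction) barrier whose gather half already ran in
// __kmp_barrier, so only the primary thread (KMP_MASTER_GTID) drives the
// release and then resynchronizes task teams.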
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
     we do not perform the copy if blocktime=infinite, since the values are not
     used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting
    // for next parallel region, Also wake them up so they switch their timers
    // to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
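// After the join-barrier gather completes, only the primary thread may still
// touch the team structure (see the comment above the gather switch); workers
// fall through and wait at the next fork barrier. The ITT frame reporting and
// KMP_STATS bookkeeping are therefore done exclusively on the primary side.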
// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
#ifdef KMP_DEBUG
  if (team)
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                  (team != NULL) ? team->t.t_id : -1, tid));
#endif
  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
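// The fork barrier is the release-only mirror of the join barrier: the primary
// thread seeds the task team and blocktime info, then releases workers with
// the configured pattern; each released worker re-reads its team pointer,
// pulls ICVs (under KMP_BARRIER_ICV_PULL), applies affinity, and runs the
// display-affinity hook before entering the parallel region.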
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
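// Summary of the three ICV propagation strategies selected above at compile
// time: ICV_PULL stages the ICVs once in the primary thread's th_fixed_icvs
// and lets workers copy them during the fork barrier; ICV_PUSH defers all
// copying to the fork-barrier release; the fallback loop here copies ICVs to
// every implicit task up front, using the ngo stores when they are available.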