1 /*
2 * kmp_barrier.cpp
3 */
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
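// Note: on KMP_MIC the ngo_* macros above use the Xeon Phi (KNC) storenrngo
// intrinsics, i.e. non-globally-ordered stores for the ICV/go broadcasts,
// which is why ngo_sync() issues an explicit serializing fence before the
// stored data is consumed.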
40 void __kmp_print_structure(void); // Forward declaration
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
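// Illustrative example (hypothetical numbers, assuming
// KMP_OPTIMIZE_FOR_REDUCTIONS == 0): with 2 sockets of 16 cores and n == 64,
// threads_per_go = 16 >> 1 = 8, num_gos = 64 / 8 = 8, num_groups = 8 / 2 = 4,
// gos_per_group = 8 / 4 = 2, and threads_per_group = 8 * 2 = 16.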
47 void distributedBarrier::computeVarsForN(size_t n) {
48 int nsockets = 1;
49 if (__kmp_topology) {
50 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52 int ncores_per_socket =
53 __kmp_topology->calculate_ratio(core_level, socket_level);
54 nsockets = __kmp_topology->get_count(socket_level);
56 if (nsockets <= 0)
57 nsockets = 1;
58 if (ncores_per_socket <= 0)
59 ncores_per_socket = 1;
61 threads_per_go = ncores_per_socket >> 1;
62 if (!fix_threads_per_go) {
63 // Minimize num_gos
64 if (threads_per_go > 4) {
65 if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66 threads_per_go = threads_per_go >> 1;
68 if (threads_per_go > 4 && nsockets == 1)
69 threads_per_go = threads_per_go >> 1;
72 if (threads_per_go == 0)
73 threads_per_go = 1;
74 fix_threads_per_go = true;
75 num_gos = n / threads_per_go;
76 if (n % threads_per_go)
77 num_gos++;
78 if (nsockets == 1 || num_gos == 1)
79 num_groups = 1;
80 else {
81 num_groups = num_gos / nsockets;
82 if (num_gos % nsockets)
83 num_groups++;
85 if (num_groups <= 0)
86 num_groups = 1;
87 gos_per_group = num_gos / num_groups;
88 if (num_gos % num_groups)
89 gos_per_group++;
90 threads_per_group = threads_per_go * gos_per_group;
91 } else {
92 num_gos = n / threads_per_go;
93 if (n % threads_per_go)
94 num_gos++;
95 if (num_gos == 1)
96 num_groups = 1;
97 else {
98 num_groups = num_gos / 2;
99 if (num_gos % 2)
100 num_groups++;
102 gos_per_group = num_gos / num_groups;
103 if (num_gos % num_groups)
104 gos_per_group++;
105 threads_per_group = threads_per_go * gos_per_group;
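// computeGo() below first picks the smallest num_gos such that
// IDEAL_CONTENTION * num_gos >= n (so that roughly at most IDEAL_CONTENTION
// threads contend on any one go flag), keeps num_gos at or below MAX_GOS by
// growing threads_per_go, and then lets computeVarsForN() finalize the
// grouping. Hypothetical example: if IDEAL_CONTENTION were 16 and n == 100,
// the loop picks num_gos = 7 and threads_per_go = ceil(100 / 7) = 15.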
109 void distributedBarrier::computeGo(size_t n) {
110 // Minimize num_gos
111 for (num_gos = 1;; num_gos++)
112 if (IDEAL_CONTENTION * num_gos >= n)
113 break;
114 threads_per_go = n / num_gos;
115 if (n % num_gos)
116 threads_per_go++;
117 while (num_gos > MAX_GOS) {
118 threads_per_go++;
119 num_gos = n / threads_per_go;
120 if (n % threads_per_go)
121 num_gos++;
123 computeVarsForN(n);
126 // Resize the barrier arrays when the new number of threads exceeds
127 // max_threads, the current capacity of all of the arrays.
128 void distributedBarrier::resize(size_t nthr) {
129 KMP_DEBUG_ASSERT(nthr > max_threads);
131 // expand to requested size * 2
132 max_threads = nthr * 2;
134 // allocate arrays to new max threads
135 for (int i = 0; i < MAX_ITERS; ++i) {
136 if (flags[i])
137 flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138 max_threads * sizeof(flags_s));
139 else
140 flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
143 if (go)
144 go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145 else
146 go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
148 if (iter)
149 iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150 else
151 iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
153 if (sleep)
154 sleep =
155 (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156 else
157 sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
160 // Set all of the go flags that threads might be waiting on. When blocktime
161 // is not infinite, this should be followed by a wake-up call to each
162 // thread.
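// In the release phase a thread spins until its go flag equals its current
// iteration plus MAX_ITERS, so storing iter[0].iter + MAX_ITERS into every go
// flag releases anything spinning at the team's current iteration.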
163 kmp_uint64 distributedBarrier::go_release() {
164 kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165 for (size_t j = 0; j < num_gos; j++) {
166 go[j].go.store(next_go);
168 return next_go;
171 void distributedBarrier::go_reset() {
172 for (size_t j = 0; j < max_threads; ++j) {
173 for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174 flags[i][j].stillNeed = 1;
176 go[j].go.store(0);
177 iter[j].iter = 0;
181 // Initialize/re-initialize the distributed barrier for a particular number
182 // of threads, resizing the arrays first if needed.
183 void distributedBarrier::init(size_t nthr) {
184 size_t old_max = max_threads;
185 if (nthr > max_threads) { // need more space in arrays
186 resize(nthr);
189 for (size_t i = 0; i < max_threads; i++) {
190 for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191 flags[j][i].stillNeed = 1;
193 go[i].go.store(0);
194 iter[i].iter = 0;
195 if (i >= old_max)
196 sleep[i].sleep = false;
199 // Recalculate num_gos, etc. based on new nthr
200 computeVarsForN(nthr);
202 num_threads = nthr;
204 if (team_icvs == NULL)
205 team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
208 // This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211 size_t start, size_t stop, size_t inc,
212 size_t tid) {
213 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215 return;
217 kmp_info_t **other_threads = team->t.t_threads;
218 for (size_t thr = start; thr < stop; thr += inc) {
219 KMP_DEBUG_ASSERT(other_threads[thr]);
220 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221 // Wake up the worker regardless of whether it appears to be sleeping
222 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
226 static void __kmp_dist_barrier_gather(
227 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230 kmp_team_t *team;
231 distributedBarrier *b;
232 kmp_info_t **other_threads;
233 kmp_uint64 my_current_iter, my_next_iter;
234 kmp_uint32 nproc;
235 bool group_leader;
237 team = this_thr->th.th_team;
238 nproc = this_thr->th.th_team_nproc;
239 other_threads = team->t.t_threads;
240 b = team->t.b;
241 my_current_iter = b->iter[tid].iter;
242 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243 group_leader = ((tid % b->threads_per_group) == 0);
245 KA_TRACE(20,
246 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247 gtid, team->t.t_id, tid, bt));
249 #if USE_ITT_BUILD && USE_ITT_NOTIFY
250 // Barrier imbalance - save arrive time to the thread
251 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253 __itt_get_timestamp();
255 #endif
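  // Two-level gather: each group leader first waits for the workers in its
  // own group to clear their stillNeed flags (reducing over them if
  // requested), then all group leaders wait for every group leader to check
  // in, with the primary thread performing the final reduction across groups.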
257 if (group_leader) {
258 // Start from the thread after the group leader
259 size_t group_start = tid + 1;
260 size_t group_end = tid + b->threads_per_group;
261 size_t threads_pending = 0;
263 if (group_end > nproc)
264 group_end = nproc;
265 do { // wait for threads in my group
266 threads_pending = 0;
267 // Check all the flags every time to avoid branch mispredicts
268 for (size_t thr = group_start; thr < group_end; thr++) {
269 // Each thread uses a different cache line
270 threads_pending += b->flags[my_current_iter][thr].stillNeed;
272 // Execute tasks here
273 if (__kmp_tasking_mode != tskm_immediate_exec) {
274 kmp_task_team_t *task_team = this_thr->th.th_task_team;
275 if (task_team != NULL) {
276 if (TCR_SYNC_4(task_team->tt.tt_active)) {
277 if (KMP_TASKING_ENABLED(task_team)) {
278 int tasks_completed = FALSE;
279 __kmp_atomic_execute_tasks_64(
280 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282 } else
283 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
285 } else {
286 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287 } // if
289 if (TCR_4(__kmp_global.g.g_done)) {
290 if (__kmp_global.g.g_abort)
291 __kmp_abort_thread();
292 break;
293 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
294 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
297 } while (threads_pending > 0);
299 if (reduce) { // Perform reduction if needed
300 OMPT_REDUCTION_DECL(this_thr, gtid);
301 OMPT_REDUCTION_BEGIN;
302 // Group leader reduces all threads in group
303 for (size_t thr = group_start; thr < group_end; thr++) {
304 (*reduce)(this_thr->th.th_local.reduce_data,
305 other_threads[thr]->th.th_local.reduce_data);
307 OMPT_REDUCTION_END;
310 // Set flag for next iteration
311 b->flags[my_next_iter][tid].stillNeed = 1;
312 // Each thread uses a different cache line; resets stillNeed to 0 to
313 // indicate it has reached the barrier
314 b->flags[my_current_iter][tid].stillNeed = 0;
316 do { // wait for all group leaders
317 threads_pending = 0;
318 for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319 threads_pending += b->flags[my_current_iter][thr].stillNeed;
321 // Execute tasks here
322 if (__kmp_tasking_mode != tskm_immediate_exec) {
323 kmp_task_team_t *task_team = this_thr->th.th_task_team;
324 if (task_team != NULL) {
325 if (TCR_SYNC_4(task_team->tt.tt_active)) {
326 if (KMP_TASKING_ENABLED(task_team)) {
327 int tasks_completed = FALSE;
328 __kmp_atomic_execute_tasks_64(
329 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331 } else
332 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
334 } else {
335 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336 } // if
338 if (TCR_4(__kmp_global.g.g_done)) {
339 if (__kmp_global.g.g_abort)
340 __kmp_abort_thread();
341 break;
342 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
343 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
346 } while (threads_pending > 0);
348 if (reduce) { // Perform reduction if needed
349 if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350 OMPT_REDUCTION_DECL(this_thr, gtid);
351 OMPT_REDUCTION_BEGIN;
352 for (size_t thr = b->threads_per_group; thr < nproc;
353 thr += b->threads_per_group) {
354 (*reduce)(this_thr->th.th_local.reduce_data,
355 other_threads[thr]->th.th_local.reduce_data);
357 OMPT_REDUCTION_END;
360 } else {
361 // Set flag for next iteration
362 b->flags[my_next_iter][tid].stillNeed = 1;
363 // Each thread uses a different cache line; resets stillNeed to 0 to
364 // indicate it has reached the barrier
365 b->flags[my_current_iter][tid].stillNeed = 0;
368 KMP_MFENCE();
370 KA_TRACE(20,
371 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372 gtid, team->t.t_id, tid, bt));
375 static void __kmp_dist_barrier_release(
376 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379 kmp_team_t *team;
380 distributedBarrier *b;
381 kmp_bstate_t *thr_bar;
382 kmp_uint64 my_current_iter, next_go;
383 size_t my_go_index;
384 bool group_leader;
386 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387 gtid, tid, bt));
389 thr_bar = &this_thr->th.th_bar[bt].bb;
391 if (!KMP_MASTER_TID(tid)) {
392 // Workers and non-master group leaders need to check their presence in the team
393 do {
394 if (this_thr->th.th_used_in_team.load() != 1 &&
395 this_thr->th.th_used_in_team.load() != 3) {
396 // Thread is not in use in a team. Wait on location in tid's thread
397 // struct. The 0 value tells anyone looking that this thread is spinning
398 // or sleeping until this location becomes 3 again; 3 is the transition
399 // state to get to 1 which is waiting on go and being in the team
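      // th_used_in_team values seen here: 1 = in use by the team, 3 = marked
      // to be added (transitions to 1 below), 2 = marked for removal
      // (transitions to 0 via the CAS below), 0 = not in use by any team.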
400 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402 0) ||
403 this_thr->th.th_used_in_team.load() == 0) {
404 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
407 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408 // In fork barrier where we could not get the object reliably
409 itt_sync_obj =
410 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411 // Cancel wait on previous parallel region...
412 __kmp_itt_task_starting(itt_sync_obj);
414 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415 return;
417 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418 if (itt_sync_obj != NULL)
419 // Call prepare as early as possible for "new" barrier
420 __kmp_itt_task_finished(itt_sync_obj);
421 } else
422 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424 return;
426 if (this_thr->th.th_used_in_team.load() != 1 &&
427 this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428 continue;
429 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430 return;
432 // At this point, the thread thinks it is in use in a team, or in
433 // transition to be used in a team, but it might have reached this barrier
434 // before it was marked unused by the team. Unused threads are awoken and
435 // shifted to wait on local thread struct elsewhere. It also might reach
436 // this point by being picked up for use by a different team. Either way,
437 // we need to update the tid.
438 tid = __kmp_tid_from_gtid(gtid);
439 team = this_thr->th.th_team;
440 KMP_DEBUG_ASSERT(tid >= 0);
441 KMP_DEBUG_ASSERT(team);
442 b = team->t.b;
443 my_current_iter = b->iter[tid].iter;
444 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445 my_go_index = tid / b->threads_per_go;
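      // threads_per_go consecutive threads share one go flag (one cache
      // line); computeVarsForN() sized this above to bound the number of
      // pollers per line.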
446 if (this_thr->th.th_used_in_team.load() == 3) {
447 (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
450 // Check if go flag is set
451 if (b->go[my_go_index].go.load() != next_go) {
452 // Wait on go flag on team
453 kmp_atomic_flag_64<false, true> my_flag(
454 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
455 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
456 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
457 b->iter[tid].iter == 0);
458 KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
461 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
462 return;
463 // At this point, the thread's go location was set. This means the primary
464 // thread is safely in the barrier, and so this thread's data is
465 // up-to-date, but we should check again that this thread is really in
466 // use in the team, as it could have been woken up for the purpose of
467 // changing team size, or reaping threads at shutdown.
468 if (this_thr->th.th_used_in_team.load() == 1)
469 break;
470 } while (1);
472 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
473 return;
475 group_leader = ((tid % b->threads_per_group) == 0);
476 if (group_leader) {
477 // Tell all the threads in my group they can go!
478 for (size_t go_idx = my_go_index + 1;
479 go_idx < my_go_index + b->gos_per_group; go_idx++) {
480 b->go[go_idx].go.store(next_go);
482 // Fence added so that workers can see changes to go. sfence inadequate.
483 KMP_MFENCE();
486 #if KMP_BARRIER_ICV_PUSH
487 if (propagate_icvs) { // copy ICVs to final dest
488 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
489 tid, FALSE);
490 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
491 (kmp_internal_control_t *)team->t.b->team_icvs);
492 copy_icvs(&thr_bar->th_fixed_icvs,
493 &team->t.t_implicit_task_taskdata[tid].td_icvs);
495 #endif
496 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
497 // This thread is now awake and participating in the barrier;
498 // wake up the other threads in the group
499 size_t nproc = this_thr->th.th_team_nproc;
500 size_t group_end = tid + b->threads_per_group;
501 if (nproc < group_end)
502 group_end = nproc;
503 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
505 } else { // Primary thread
506 team = this_thr->th.th_team;
507 b = team->t.b;
508 my_current_iter = b->iter[tid].iter;
509 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
510 #if KMP_BARRIER_ICV_PUSH
511 if (propagate_icvs) {
512 // primary thread has ICVs in final destination; copy
513 copy_icvs(&thr_bar->th_fixed_icvs,
514 &team->t.t_implicit_task_taskdata[tid].td_icvs);
516 #endif
517 // Tell all the group leaders they can go!
518 for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
519 b->go[go_idx].go.store(next_go);
522 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
523 // Wake-up the group leaders
524 size_t nproc = this_thr->th.th_team_nproc;
525 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
526 b->threads_per_group, tid);
529 // Tell all the threads in my group they can go!
530 for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
531 b->go[go_idx].go.store(next_go);
534 // Fence added so that workers can see changes to go. sfence inadequate.
535 KMP_MFENCE();
537 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
538 // Wake-up the other threads in my group
539 size_t nproc = this_thr->th.th_team_nproc;
540 size_t group_end = tid + b->threads_per_group;
541 if (nproc < group_end)
542 group_end = nproc;
543 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
546 // Update to next iteration
547 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
548 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
550 KA_TRACE(
551 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
552 gtid, team->t.t_id, tid, bt));
555 // Linear Barrier
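// In the linear algorithm every worker reports directly to the primary
// thread, which waits on each worker's b_arrived flag in turn, so the gather
// is O(nproc) work on the primary; the release below is likewise a simple
// loop over all workers' b_go flags.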
556 template <bool cancellable = false>
557 static bool __kmp_linear_barrier_gather_template(
558 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
559 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
560 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
561 kmp_team_t *team = this_thr->th.th_team;
562 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
563 kmp_info_t **other_threads = team->t.t_threads;
565 KA_TRACE(
567 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
568 gtid, team->t.t_id, tid, bt));
569 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572 // Barrier imbalance - save arrive time to the thread
573 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
574 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
575 __itt_get_timestamp();
577 #endif
578 // We now perform a linear reduction to signal that all of the threads have
579 // arrived.
580 if (!KMP_MASTER_TID(tid)) {
581 KA_TRACE(20,
582 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
583 "arrived(%p): %llu => %llu\n",
584 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
585 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
586 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
587 // Mark arrival to primary thread
588 /* After performing this write, a worker thread may not assume that the team
589 is valid any more - it could be deallocated by the primary thread at any
590 time. */
591 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
592 flag.release();
593 } else {
594 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
595 int nproc = this_thr->th.th_team_nproc;
596 int i;
597 // Don't have to worry about sleep bit here or atomic since team setting
598 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
600 // Collect all the worker team member threads.
601 for (i = 1; i < nproc; ++i) {
602 #if KMP_CACHE_MANAGE
603 // Prefetch next thread's arrived count
604 if (i + 1 < nproc)
605 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
606 #endif /* KMP_CACHE_MANAGE */
607 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
608 "arrived(%p) == %llu\n",
609 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
610 team->t.t_id, i,
611 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
613 // Wait for worker thread to arrive
614 if (cancellable) {
615 kmp_flag_64<true, false> flag(
616 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
617 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
618 return true;
619 } else {
620 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
621 new_state);
622 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
625 // Barrier imbalance - write min of the thread time and the other thread
626 // time to the thread.
627 if (__kmp_forkjoin_frames_mode == 2) {
628 this_thr->th.th_bar_min_time = KMP_MIN(
629 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
631 #endif
632 if (reduce) {
633 KA_TRACE(100,
634 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
635 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
636 team->t.t_id, i));
637 OMPT_REDUCTION_DECL(this_thr, gtid);
638 OMPT_REDUCTION_BEGIN;
639 (*reduce)(this_thr->th.th_local.reduce_data,
640 other_threads[i]->th.th_local.reduce_data);
641 OMPT_REDUCTION_END;
644 // Don't have to worry about sleep bit here or atomic since team setting
645 team_bar->b_arrived = new_state;
646 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
647 "arrived(%p) = %llu\n",
648 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
649 new_state));
651 KA_TRACE(
653 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
654 gtid, team->t.t_id, tid, bt));
655 return false;
658 template <bool cancellable = false>
659 static bool __kmp_linear_barrier_release_template(
660 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
661 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
662 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
663 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
664 kmp_team_t *team;
666 if (KMP_MASTER_TID(tid)) {
667 unsigned int i;
668 kmp_uint32 nproc = this_thr->th.th_team_nproc;
669 kmp_info_t **other_threads;
671 team = __kmp_threads[gtid]->th.th_team;
672 KMP_DEBUG_ASSERT(team != NULL);
673 other_threads = team->t.t_threads;
675 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
676 "barrier type %d\n",
677 gtid, team->t.t_id, tid, bt));
679 if (nproc > 1) {
680 #if KMP_BARRIER_ICV_PUSH
682 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
683 if (propagate_icvs) {
684 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
685 for (i = 1; i < nproc; ++i) {
686 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
687 team, i, FALSE);
688 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
689 &team->t.t_implicit_task_taskdata[0].td_icvs);
691 ngo_sync();
694 #endif // KMP_BARRIER_ICV_PUSH
696 // Now, release all of the worker threads
697 for (i = 1; i < nproc; ++i) {
698 #if KMP_CACHE_MANAGE
699 // Prefetch next thread's go flag
700 if (i + 1 < nproc)
701 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
702 #endif /* KMP_CACHE_MANAGE */
703 KA_TRACE(
705 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
706 "go(%p): %u => %u\n",
707 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
708 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
709 other_threads[i]->th.th_bar[bt].bb.b_go,
710 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
711 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
712 other_threads[i]);
713 flag.release();
716 } else { // Wait for the PRIMARY thread to release us
717 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
718 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
719 if (cancellable) {
720 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
721 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
722 return true;
723 } else {
724 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
725 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
727 #if USE_ITT_BUILD && USE_ITT_NOTIFY
728 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
729 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
730 // disabled)
731 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
732 // Cancel wait on previous parallel region...
733 __kmp_itt_task_starting(itt_sync_obj);
735 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
736 return false;
738 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
739 if (itt_sync_obj != NULL)
740 // Call prepare as early as possible for "new" barrier
741 __kmp_itt_task_finished(itt_sync_obj);
742 } else
743 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
744 // Early exit for reaping threads releasing forkjoin barrier
745 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
746 return false;
747 // The worker thread may now assume that the team is valid.
748 #ifdef KMP_DEBUG
749 tid = __kmp_tid_from_gtid(gtid);
750 team = __kmp_threads[gtid]->th.th_team;
751 #endif
752 KMP_DEBUG_ASSERT(team != NULL);
753 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
754 KA_TRACE(20,
755 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
756 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
757 KMP_MB(); // Flush all pending memory write invalidates.
759 KA_TRACE(
761 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
762 gtid, team->t.t_id, tid, bt));
763 return false;
766 static void __kmp_linear_barrier_gather(
767 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
768 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
769 __kmp_linear_barrier_gather_template<false>(
770 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
773 static bool __kmp_linear_barrier_gather_cancellable(
774 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
775 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
776 return __kmp_linear_barrier_gather_template<true>(
777 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
780 static void __kmp_linear_barrier_release(
781 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
782 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
783 __kmp_linear_barrier_release_template<false>(
784 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
787 static bool __kmp_linear_barrier_release_cancellable(
788 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
789 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
790 return __kmp_linear_barrier_release_template<true>(
791 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
794 // Tree barrier
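// In the tree algorithm each thread has up to branch_factor = 1 << branch_bits
// children; a thread's first child is tid * branch_factor + 1 and its parent
// is (tid - 1) >> branch_bits, so arrivals propagate up the tree and releases
// propagate back down it.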
795 static void __kmp_tree_barrier_gather(
796 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
797 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
798 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
799 kmp_team_t *team = this_thr->th.th_team;
800 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
801 kmp_info_t **other_threads = team->t.t_threads;
802 kmp_uint32 nproc = this_thr->th.th_team_nproc;
803 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
804 kmp_uint32 branch_factor = 1 << branch_bits;
805 kmp_uint32 child;
806 kmp_uint32 child_tid;
807 kmp_uint64 new_state = 0;
809 KA_TRACE(
810 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
811 gtid, team->t.t_id, tid, bt));
812 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
814 #if USE_ITT_BUILD && USE_ITT_NOTIFY
815 // Barrier imbalance - save arrive time to the thread
816 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
817 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
818 __itt_get_timestamp();
820 #endif
821 // Perform tree gather to wait until all threads have arrived; reduce any
822 // required data as we go
823 child_tid = (tid << branch_bits) + 1;
824 if (child_tid < nproc) {
825 // Parent threads wait for all their children to arrive
826 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
827 child = 1;
828 do {
829 kmp_info_t *child_thr = other_threads[child_tid];
830 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
831 #if KMP_CACHE_MANAGE
832 // Prefetch next thread's arrived count
833 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
834 KMP_CACHE_PREFETCH(
835 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
836 #endif /* KMP_CACHE_MANAGE */
837 KA_TRACE(20,
838 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
839 "arrived(%p) == %llu\n",
840 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
841 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
842 // Wait for child to arrive
843 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
844 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
845 #if USE_ITT_BUILD && USE_ITT_NOTIFY
846 // Barrier imbalance - write min of the thread time and a child time to
847 // the thread.
848 if (__kmp_forkjoin_frames_mode == 2) {
849 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
850 child_thr->th.th_bar_min_time);
852 #endif
853 if (reduce) {
854 KA_TRACE(100,
855 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
856 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
857 team->t.t_id, child_tid));
858 OMPT_REDUCTION_DECL(this_thr, gtid);
859 OMPT_REDUCTION_BEGIN;
860 (*reduce)(this_thr->th.th_local.reduce_data,
861 child_thr->th.th_local.reduce_data);
862 OMPT_REDUCTION_END;
864 child++;
865 child_tid++;
866 } while (child <= branch_factor && child_tid < nproc);
869 if (!KMP_MASTER_TID(tid)) { // Worker threads
870 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
872 KA_TRACE(20,
873 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
874 "arrived(%p): %llu => %llu\n",
875 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
876 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
877 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
879 // Mark arrival to parent thread
880 /* After performing this write, a worker thread may not assume that the team
881 is valid any more - it could be deallocated by the primary thread at any
882 time. */
883 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
884 flag.release();
885 } else {
886 // Need to update the team arrived pointer if we are the primary thread
887 if (nproc > 1) // New value was already computed above
888 team->t.t_bar[bt].b_arrived = new_state;
889 else
890 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
891 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
892 "arrived(%p) = %llu\n",
893 gtid, team->t.t_id, tid, team->t.t_id,
894 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
896 KA_TRACE(20,
897 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
898 gtid, team->t.t_id, tid, bt));
901 static void __kmp_tree_barrier_release(
902 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
903 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
904 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
905 kmp_team_t *team;
906 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
907 kmp_uint32 nproc;
908 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
909 kmp_uint32 branch_factor = 1 << branch_bits;
910 kmp_uint32 child;
911 kmp_uint32 child_tid;
913 // Perform a tree release for all of the threads that have been gathered
914 if (!KMP_MASTER_TID(
915 tid)) { // Handle fork barrier workers who aren't part of a team yet
916 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
917 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
918 // Wait for parent thread to release us
919 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
920 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
921 #if USE_ITT_BUILD && USE_ITT_NOTIFY
922 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
923 // In fork barrier where we could not get the object reliably (or
924 // ITTNOTIFY is disabled)
925 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
926 // Cancel wait on previous parallel region...
927 __kmp_itt_task_starting(itt_sync_obj);
929 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
930 return;
932 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
933 if (itt_sync_obj != NULL)
934 // Call prepare as early as possible for "new" barrier
935 __kmp_itt_task_finished(itt_sync_obj);
936 } else
937 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
938 // Early exit for reaping threads releasing forkjoin barrier
939 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
940 return;
942 // The worker thread may now assume that the team is valid.
943 team = __kmp_threads[gtid]->th.th_team;
944 KMP_DEBUG_ASSERT(team != NULL);
945 tid = __kmp_tid_from_gtid(gtid);
947 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
948 KA_TRACE(20,
949 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
950 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
951 KMP_MB(); // Flush all pending memory write invalidates.
952 } else {
953 team = __kmp_threads[gtid]->th.th_team;
954 KMP_DEBUG_ASSERT(team != NULL);
955 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
956 "barrier type %d\n",
957 gtid, team->t.t_id, tid, bt));
959 nproc = this_thr->th.th_team_nproc;
960 child_tid = (tid << branch_bits) + 1;
962 if (child_tid < nproc) {
963 kmp_info_t **other_threads = team->t.t_threads;
964 child = 1;
965 // Parent threads release all their children
966 do {
967 kmp_info_t *child_thr = other_threads[child_tid];
968 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
969 #if KMP_CACHE_MANAGE
970 // Prefetch next thread's go count
971 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
972 KMP_CACHE_PREFETCH(
973 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
974 #endif /* KMP_CACHE_MANAGE */
976 #if KMP_BARRIER_ICV_PUSH
978 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
979 if (propagate_icvs) {
980 __kmp_init_implicit_task(team->t.t_ident,
981 team->t.t_threads[child_tid], team,
982 child_tid, FALSE);
983 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
984 &team->t.t_implicit_task_taskdata[0].td_icvs);
987 #endif // KMP_BARRIER_ICV_PUSH
988 KA_TRACE(20,
989 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
990 "go(%p): %u => %u\n",
991 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
992 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
993 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
994 // Release child from barrier
995 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
996 flag.release();
997 child++;
998 child_tid++;
999 } while (child <= branch_factor && child_tid < nproc);
1001 KA_TRACE(
1002 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1003 gtid, team->t.t_id, tid, bt));
1006 // Hyper Barrier
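// Hypercube-embedded tree: at each level, a thread whose tid has a non-zero
// digit at that level reports to the partner obtained by clearing those bits
// (tid & ~((1 << (level + branch_bits)) - 1)), while threads with a zero
// digit wait for up to branch_factor - 1 partners at stride (1 << level)
// before moving up a level.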
1007 static void __kmp_hyper_barrier_gather(
1008 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1009 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1010 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1011 kmp_team_t *team = this_thr->th.th_team;
1012 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1013 kmp_info_t **other_threads = team->t.t_threads;
1014 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1015 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1016 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1017 kmp_uint32 branch_factor = 1 << branch_bits;
1018 kmp_uint32 offset;
1019 kmp_uint32 level;
1021 KA_TRACE(
1023 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1024 gtid, team->t.t_id, tid, bt));
1025 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1027 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1028 // Barrier imbalance - save arrive time to the thread
1029 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1030 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1031 __itt_get_timestamp();
1033 #endif
1034 /* Perform a hypercube-embedded tree gather to wait until all of the threads
1035 have arrived, and reduce any required data as we go. */
1036 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1037 for (level = 0, offset = 1; offset < num_threads;
1038 level += branch_bits, offset <<= branch_bits) {
1039 kmp_uint32 child;
1040 kmp_uint32 child_tid;
1042 if (((tid >> level) & (branch_factor - 1)) != 0) {
1043 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1045 KMP_MB(); // Synchronize parent and child threads.
1046 KA_TRACE(20,
1047 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1048 "arrived(%p): %llu => %llu\n",
1049 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1050 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1051 thr_bar->b_arrived,
1052 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1053 // Mark arrival to parent thread
1054 /* After performing this write (in the last iteration of the enclosing for
1055 loop), a worker thread may not assume that the team is valid any more
1056 - it could be deallocated by the primary thread at any time. */
1057 p_flag.set_waiter(other_threads[parent_tid]);
1058 p_flag.release();
1059 break;
1062 // Parent threads wait for children to arrive
1063 if (new_state == KMP_BARRIER_UNUSED_STATE)
1064 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1065 for (child = 1, child_tid = tid + (1 << level);
1066 child < branch_factor && child_tid < num_threads;
1067 child++, child_tid += (1 << level)) {
1068 kmp_info_t *child_thr = other_threads[child_tid];
1069 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1070 #if KMP_CACHE_MANAGE
1071 kmp_uint32 next_child_tid = child_tid + (1 << level);
1072 // Prefetch next thread's arrived count
1073 if (child + 1 < branch_factor && next_child_tid < num_threads)
1074 KMP_CACHE_PREFETCH(
1075 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1076 #endif /* KMP_CACHE_MANAGE */
1077 KA_TRACE(20,
1078 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1079 "arrived(%p) == %llu\n",
1080 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1081 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1082 // Wait for child to arrive
1083 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1084 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1085 KMP_MB(); // Synchronize parent and child threads.
1086 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1087 // Barrier imbalance - write min of the thread time and a child time to
1088 // the thread.
1089 if (__kmp_forkjoin_frames_mode == 2) {
1090 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1091 child_thr->th.th_bar_min_time);
1093 #endif
1094 if (reduce) {
1095 KA_TRACE(100,
1096 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1097 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1098 team->t.t_id, child_tid));
1099 OMPT_REDUCTION_DECL(this_thr, gtid);
1100 OMPT_REDUCTION_BEGIN;
1101 (*reduce)(this_thr->th.th_local.reduce_data,
1102 child_thr->th.th_local.reduce_data);
1103 OMPT_REDUCTION_END;
1108 if (KMP_MASTER_TID(tid)) {
1109 // Need to update the team arrived pointer if we are the primary thread
1110 if (new_state == KMP_BARRIER_UNUSED_STATE)
1111 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1112 else
1113 team->t.t_bar[bt].b_arrived = new_state;
1114 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1115 "arrived(%p) = %llu\n",
1116 gtid, team->t.t_id, tid, team->t.t_id,
1117 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1119 KA_TRACE(
1120 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1121 gtid, team->t.t_id, tid, bt));
1124 // The reverse versions seem to beat the forward versions overall
1125 #define KMP_REVERSE_HYPER_BAR
1126 static void __kmp_hyper_barrier_release(
1127 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1128 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1129 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1130 kmp_team_t *team;
1131 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1132 kmp_info_t **other_threads;
1133 kmp_uint32 num_threads;
1134 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1135 kmp_uint32 branch_factor = 1 << branch_bits;
1136 kmp_uint32 child;
1137 kmp_uint32 child_tid;
1138 kmp_uint32 offset;
1139 kmp_uint32 level;
1141 /* Perform a hypercube-embedded tree release for all of the threads that have
1142 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1143 are released in the reverse order of the corresponding gather, otherwise
1144 threads are released in the same order. */
1145 if (KMP_MASTER_TID(tid)) { // primary thread
1146 team = __kmp_threads[gtid]->th.th_team;
1147 KMP_DEBUG_ASSERT(team != NULL);
1148 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1149 "barrier type %d\n",
1150 gtid, team->t.t_id, tid, bt));
1151 #if KMP_BARRIER_ICV_PUSH
1152 if (propagate_icvs) { // primary already has ICVs in final destination; copy
1153 copy_icvs(&thr_bar->th_fixed_icvs,
1154 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1156 #endif
1157 } else { // Handle fork barrier workers who aren't part of a team yet
1158 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1159 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1160 // Wait for parent thread to release us
1161 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1162 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1163 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1164 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1165 // In fork barrier where we could not get the object reliably
1166 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1167 // Cancel wait on previous parallel region...
1168 __kmp_itt_task_starting(itt_sync_obj);
1170 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1171 return;
1173 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1174 if (itt_sync_obj != NULL)
1175 // Call prepare as early as possible for "new" barrier
1176 __kmp_itt_task_finished(itt_sync_obj);
1177 } else
1178 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1179 // Early exit for reaping threads releasing forkjoin barrier
1180 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1181 return;
1183 // The worker thread may now assume that the team is valid.
1184 team = __kmp_threads[gtid]->th.th_team;
1185 KMP_DEBUG_ASSERT(team != NULL);
1186 tid = __kmp_tid_from_gtid(gtid);
1188 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1189 KA_TRACE(20,
1190 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1191 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1192 KMP_MB(); // Flush all pending memory write invalidates.
1194 num_threads = this_thr->th.th_team_nproc;
1195 other_threads = team->t.t_threads;
1197 #ifdef KMP_REVERSE_HYPER_BAR
1198 // Count up to correct level for parent
1199 for (level = 0, offset = 1;
1200 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1201 level += branch_bits, offset <<= branch_bits)
1204 // Now go down from there
1205 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1206 level -= branch_bits, offset >>= branch_bits)
1207 #else
1208 // Go down the tree, level by level
1209 for (level = 0, offset = 1; offset < num_threads;
1210 level += branch_bits, offset <<= branch_bits)
1211 #endif // KMP_REVERSE_HYPER_BAR
1213 #ifdef KMP_REVERSE_HYPER_BAR
1214 /* Now go in reverse order through the children, highest to lowest.
1215 Initial setting of child is conservative here. */
1216 child = num_threads >> ((level == 0) ? level : level - 1);
1217 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1218 child_tid = tid + (child << level);
1219 child >= 1; child--, child_tid -= (1 << level))
1220 #else
1221 if (((tid >> level) & (branch_factor - 1)) != 0)
1222 // No need to go lower than this, since this is the level at which the
1223 // parent would be notified
1224 break;
1225 // Iterate through children on this level of the tree
1226 for (child = 1, child_tid = tid + (1 << level);
1227 child < branch_factor && child_tid < num_threads;
1228 child++, child_tid += (1 << level))
1229 #endif // KMP_REVERSE_HYPER_BAR
1231 if (child_tid >= num_threads)
1232 continue; // Child doesn't exist so keep going
1233 else {
1234 kmp_info_t *child_thr = other_threads[child_tid];
1235 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1236 #if KMP_CACHE_MANAGE
1237 kmp_uint32 next_child_tid = child_tid - (1 << level);
1238 // Prefetch next thread's go count
1239 #ifdef KMP_REVERSE_HYPER_BAR
1240 if (child - 1 >= 1 && next_child_tid < num_threads)
1241 #else
1242 if (child + 1 < branch_factor && next_child_tid < num_threads)
1243 #endif // KMP_REVERSE_HYPER_BAR
1244 KMP_CACHE_PREFETCH(
1245 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1246 #endif /* KMP_CACHE_MANAGE */
1248 #if KMP_BARRIER_ICV_PUSH
1249 if (propagate_icvs) // push my fixed ICVs to my child
1250 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1251 #endif // KMP_BARRIER_ICV_PUSH
1253 KA_TRACE(
1255 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1256 "go(%p): %u => %u\n",
1257 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1258 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1259 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1260 // Release child from barrier
1261 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1262 flag.release();
1266 #if KMP_BARRIER_ICV_PUSH
1267 if (propagate_icvs &&
1268 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1269 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1270 FALSE);
1271 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1272 &thr_bar->th_fixed_icvs);
1274 #endif
1275 KA_TRACE(
1277 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1278 gtid, team->t.t_id, tid, bt));
1281 // Hierarchical Barrier
1283 // Initialize thread barrier data
1284 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1285 Performs the minimum amount of initialization required based on how the team
1286 has changed. Returns true if leaf children will require both on-core and
1287 traditional wake-up mechanisms. For example, if the team size increases,
1288 threads already in the team will respond to on-core wakeup on their parent
1289 thread, but threads newly added to the team will only be listening on
1290 their local b_go. */
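// The shape of the hierarchy (depth, skip_per_level, number of leaf children)
// comes from __kmp_get_hierarchy(), which maps the team onto the machine
// topology; each thread caches its level, parent tid, and byte offset within
// its parent's flag word here.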
1291 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1292 kmp_bstate_t *thr_bar,
1293 kmp_uint32 nproc, int gtid,
1294 int tid, kmp_team_t *team) {
1295 // Checks to determine if (re-)initialization is needed
1296 bool uninitialized = thr_bar->team == NULL;
1297 bool team_changed = team != thr_bar->team;
1298 bool team_sz_changed = nproc != thr_bar->nproc;
1299 bool tid_changed = tid != thr_bar->old_tid;
1300 bool retval = false;
1302 if (uninitialized || team_sz_changed) {
1303 __kmp_get_hierarchy(nproc, thr_bar);
1306 if (uninitialized || team_sz_changed || tid_changed) {
1307 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1308 thr_bar->parent_tid = -1; // default for primary thread
1309 if (!KMP_MASTER_TID(tid)) {
1310 // if not primary thread, find parent thread in hierarchy
1311 kmp_uint32 d = 0;
1312 while (d < thr_bar->depth) { // find parent based on level of thread in
1313 // hierarchy, and note level
1314 kmp_uint32 rem;
1315 if (d == thr_bar->depth - 2) { // reached level right below the primary
1316 thr_bar->parent_tid = 0;
1317 thr_bar->my_level = d;
1318 break;
1319 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1320 // TODO: can we make the above op faster?
1321 // thread is not a subtree root at next level, so this is max
1322 thr_bar->parent_tid = tid - rem;
1323 thr_bar->my_level = d;
1324 break;
1326 ++d;
1329 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1330 (thr_bar->skip_per_level[thr_bar->my_level])),
1331 &(thr_bar->offset));
1332 thr_bar->old_tid = tid;
1333 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1334 thr_bar->team = team;
1335 thr_bar->parent_bar =
1336 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1338 if (uninitialized || team_changed || tid_changed) {
1339 thr_bar->team = team;
1340 thr_bar->parent_bar =
1341 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1342 retval = true;
1344 if (uninitialized || team_sz_changed || tid_changed) {
1345 thr_bar->nproc = nproc;
1346 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1347 if (thr_bar->my_level == 0)
1348 thr_bar->leaf_kids = 0;
1349 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1350 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1351 thr_bar->leaf_state = 0;
1352 for (int i = 0; i < thr_bar->leaf_kids; ++i)
1353 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
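    // Each leaf child is assigned its own byte of the 64-bit leaf_state mask;
    // leaf children check in by writing their byte of this thread's b_arrived
    // word (the kmp_flag_oncore path), so all leaves can be waited for with a
    // single 64-bit comparison.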
1355 return retval;
1358 static void __kmp_hierarchical_barrier_gather(
1359 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1360 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1361 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1362 kmp_team_t *team = this_thr->th.th_team;
1363 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1364 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1365 kmp_info_t **other_threads = team->t.t_threads;
1366 kmp_uint64 new_state = 0;
1368 int level = team->t.t_level;
1369 if (other_threads[0]
1370 ->th.th_teams_microtask) // are we inside the teams construct?
1371 if (this_thr->th.th_teams_size.nteams > 1)
1372 ++level; // level was not increased in teams construct for team_of_masters
1373 if (level == 1)
1374 thr_bar->use_oncore_barrier = 1;
1375 else
1376 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1378 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1379 "barrier type %d\n",
1380 gtid, team->t.t_id, tid, bt));
1381 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1383 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1384 // Barrier imbalance - save arrive time to the thread
1385 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1386 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1388 #endif
1390 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1391 team);
1393 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1394 kmp_int32 child_tid;
1395 new_state =
1396 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1397 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1398 thr_bar->use_oncore_barrier) {
1399 if (thr_bar->leaf_kids) {
1400 // First, wait for leaf children to check-in on my b_arrived flag
1401 kmp_uint64 leaf_state =
1402 KMP_MASTER_TID(tid)
1403 ? thr_bar->b_arrived | thr_bar->leaf_state
1404 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1405 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1406 "for leaf kids\n",
1407 gtid, team->t.t_id, tid));
1408 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1409 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1410 if (reduce) {
1411 OMPT_REDUCTION_DECL(this_thr, gtid);
1412 OMPT_REDUCTION_BEGIN;
1413 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1414 ++child_tid) {
1415 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1416 "T#%d(%d:%d)\n",
1417 gtid, team->t.t_id, tid,
1418 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1419 child_tid));
1420 (*reduce)(this_thr->th.th_local.reduce_data,
1421 other_threads[child_tid]->th.th_local.reduce_data);
1423 OMPT_REDUCTION_END;
1425 // clear leaf_state bits
1426 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1428 // Next, wait for higher level children on each child's b_arrived flag
1429 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1430 ++d) { // gather lowest level threads first, but skip 0
1431 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1432 skip = thr_bar->skip_per_level[d];
1433 if (last > nproc)
1434 last = nproc;
1435 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1436 kmp_info_t *child_thr = other_threads[child_tid];
1437 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1438 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1439 "T#%d(%d:%d) "
1440 "arrived(%p) == %llu\n",
1441 gtid, team->t.t_id, tid,
1442 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1443 child_tid, &child_bar->b_arrived, new_state));
1444 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1445 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1446 if (reduce) {
1447 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1448 "T#%d(%d:%d)\n",
1449 gtid, team->t.t_id, tid,
1450 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1451 child_tid));
1452 (*reduce)(this_thr->th.th_local.reduce_data,
1453 child_thr->th.th_local.reduce_data);
1457 } else { // Blocktime is not infinite
1458 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1459 ++d) { // Gather lowest level threads first
1460 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1461 skip = thr_bar->skip_per_level[d];
1462 if (last > nproc)
1463 last = nproc;
1464 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1465 kmp_info_t *child_thr = other_threads[child_tid];
1466 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1467 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1468 "T#%d(%d:%d) "
1469 "arrived(%p) == %llu\n",
1470 gtid, team->t.t_id, tid,
1471 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1472 child_tid, &child_bar->b_arrived, new_state));
1473 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1474 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1475 if (reduce) {
1476 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1477 "T#%d(%d:%d)\n",
1478 gtid, team->t.t_id, tid,
1479 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1480 child_tid));
1481 (*reduce)(this_thr->th.th_local.reduce_data,
1482 child_thr->th.th_local.reduce_data);
1488 // All subordinates are gathered; now release parent if not primary thread
1490 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1491 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1492 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1493 gtid, team->t.t_id, tid,
1494 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1495 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1496 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1497 /* Mark arrival to parent: After performing this write, a worker thread may
1498 not assume that the team is valid any more - it could be deallocated by
1499 the primary thread at any time. */
1500 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1501 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1502 // flag; release it
1503 kmp_flag_64<> flag(&thr_bar->b_arrived,
1504 other_threads[thr_bar->parent_tid]);
1505 flag.release();
1506 } else {
1507 // Leaf does special release on "offset" bits of parent's b_arrived flag
1508 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1509 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1510 thr_bar->offset + 1);
1511 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1512 flag.release();
1514 } else { // Primary thread needs to update the team's b_arrived value
1515 team->t.t_bar[bt].b_arrived = new_state;
1516 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1517 "arrived(%p) = %llu\n",
1518 gtid, team->t.t_id, tid, team->t.t_id,
1519 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1521 // Is the team access below unsafe or just technically invalid?
1522 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1523 "barrier type %d\n",
1524 gtid, team->t.t_id, tid, bt));
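// Release phase of the hierarchical barrier. The primary thread (and every
// non-leaf node) wakes its children level by level, optionally pushing ICVs.
// With infinite blocktime and the on-core barrier, the cache line holding
// th_fixed_icvs and b_go is broadcast with NGO stores, and leaf children are
// released by setting leaf_state bits on this thread's own b_go flag.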
1527 static void __kmp_hierarchical_barrier_release(
1528 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1529 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1530 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1531 kmp_team_t *team;
1532 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1533 kmp_uint32 nproc;
1534 bool team_change = false; // indicates on-core barrier shouldn't be used
1536 if (KMP_MASTER_TID(tid)) {
1537 team = __kmp_threads[gtid]->th.th_team;
1538 KMP_DEBUG_ASSERT(team != NULL);
1539 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1540 "entered barrier type %d\n",
1541 gtid, team->t.t_id, tid, bt));
1542 } else { // Worker threads
1543 // Wait for parent thread to release me
1544 if (!thr_bar->use_oncore_barrier ||
1545 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1546 thr_bar->team == NULL) {
1547 // Use traditional method of waiting on my own b_go flag
1548 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1549 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1550 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1551 TCW_8(thr_bar->b_go,
1552 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1553 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1554 // infinite, not nested
1555 // Wait on my "offset" bits on parent's b_go flag
1556 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1557 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1558 thr_bar->offset + 1, bt,
1559 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1560 flag.wait(this_thr, TRUE);
1561 if (thr_bar->wait_flag ==
1562 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1563 TCW_8(thr_bar->b_go,
1564 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1565 } else { // Reset my bits on parent's b_go flag
1566 (RCAST(volatile char *,
1567 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1570 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1571 // Early exit for reaping threads releasing forkjoin barrier
1572 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1573 return;
1574 // The worker thread may now assume that the team is valid.
1575 team = __kmp_threads[gtid]->th.th_team;
1576 KMP_DEBUG_ASSERT(team != NULL);
1577 tid = __kmp_tid_from_gtid(gtid);
1579 KA_TRACE(
1581 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1582 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1583 KMP_MB(); // Flush all pending memory write invalidates.
1586 nproc = this_thr->th.th_team_nproc;
1587 int level = team->t.t_level;
1588 if (team->t.t_threads[0]
1589 ->th.th_teams_microtask) { // are we inside the teams construct?
1590 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1591 this_thr->th.th_teams_level == level)
1592 ++level; // level was not increased in teams construct for team_of_workers
1593 if (this_thr->th.th_teams_size.nteams > 1)
1594 ++level; // level was not increased in teams construct for team_of_masters
1596 if (level == 1)
1597 thr_bar->use_oncore_barrier = 1;
1598 else
1599 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1601 // If the team size has increased, we still communicate with old leaves via
1602 // oncore barrier.
1603 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1604 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1605 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1606 tid, team);
1607 // But if the entire team changes, we won't use oncore barrier at all
1608 if (team_change)
1609 old_leaf_kids = 0;
1611 #if KMP_BARRIER_ICV_PUSH
1612 if (propagate_icvs) {
1613 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1614 FALSE);
1615 if (KMP_MASTER_TID(
1616 tid)) { // primary already has copy in final destination; copy
1617 copy_icvs(&thr_bar->th_fixed_icvs,
1618 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1619 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1620 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1621 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1622 // leaves (on-core children) pull parent's fixed ICVs directly to local
1623 // ICV store
1624 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1625 &thr_bar->parent_bar->th_fixed_icvs);
1626 // non-leaves will get ICVs piggybacked with b_go via NGO store
1627 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1628 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so
1629 // children can access them
1630 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1631 else // leaves copy parent's fixed ICVs directly to local ICV store
1632 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1633 &thr_bar->parent_bar->th_fixed_icvs);
1636 #endif // KMP_BARRIER_ICV_PUSH
1638 // Now, release my children
1639 if (thr_bar->my_level) { // not a leaf
1640 kmp_int32 child_tid;
1641 kmp_uint32 last;
1642 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1643 thr_bar->use_oncore_barrier) {
1644 if (KMP_MASTER_TID(tid)) { // do a flat release
1645 // Set local b_go to bump children via NGO store of the cache line
1646 // containing ICVs and b_go.
1647 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1648 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1649 // the cache line
1650 ngo_load(&thr_bar->th_fixed_icvs);
1651 // This loop iterates over all the threads, skipping only the leaf nodes
1652 // in the hierarchy
1653 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1654 child_tid += thr_bar->skip_per_level[1]) {
1655 kmp_bstate_t *child_bar =
1656 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1657 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1658 "releasing T#%d(%d:%d)"
1659 " go(%p): %u => %u\n",
1660 gtid, team->t.t_id, tid,
1661 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1662 child_tid, &child_bar->b_go, child_bar->b_go,
1663 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1664 // Use ngo store (if available) to both store ICVs and release child
1665 // via child's b_go
1666 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1668 ngo_sync();
1670 TCW_8(thr_bar->b_go,
1671 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1672 // Now, release leaf children
1673 if (thr_bar->leaf_kids) { // if there are any
1674 // We test team_change on the off-chance that the level 1 team changed.
1675 if (team_change ||
1676 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1677 if (old_leaf_kids) { // release old leaf kids
1678 thr_bar->b_go |= old_leaf_state;
1680 // Release new leaf kids
1681 last = tid + thr_bar->skip_per_level[1];
1682 if (last > nproc)
1683 last = nproc;
1684 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1685 ++child_tid) { // skip_per_level[0]=1
1686 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1687 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1688 KA_TRACE(
1690 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1691 " T#%d(%d:%d) go(%p): %u => %u\n",
1692 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1693 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1694 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1695 // Release child using child's b_go flag
1696 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1697 flag.release();
1699 } else { // Release all children at once with leaf_state bits on my own
1700 // b_go flag
1701 thr_bar->b_go |= thr_bar->leaf_state;
1704 } else { // Blocktime is not infinite; do a simple hierarchical release
1705 for (int d = thr_bar->my_level - 1; d >= 0;
1706 --d) { // Release highest level threads first
1707 last = tid + thr_bar->skip_per_level[d + 1];
1708 kmp_uint32 skip = thr_bar->skip_per_level[d];
1709 if (last > nproc)
1710 last = nproc;
1711 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1712 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1713 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1714 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1715 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1716 gtid, team->t.t_id, tid,
1717 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1718 child_tid, &child_bar->b_go, child_bar->b_go,
1719 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720 // Release child using child's b_go flag
1721 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722 flag.release();
1726 #if KMP_BARRIER_ICV_PUSH
1727 if (propagate_icvs && !KMP_MASTER_TID(tid))
1728 // non-leaves copy ICVs from fixed ICVs to local dest
1729 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1730 &thr_bar->th_fixed_icvs);
1731 #endif // KMP_BARRIER_ICV_PUSH
1733 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1734 "barrier type %d\n",
1735 gtid, team->t.t_id, tid, bt));
1738 // End of Barrier Algorithms
1740 // type traits for cancellable value
1741 // if cancellable is true, then is_cancellable is a normal boolean variable
1742 // if cancellable is false, then is_cancellable is a compile time constant
1743 template <bool cancellable> struct is_cancellable {};
1744 template <> struct is_cancellable<true> {
1745 bool value;
1746 is_cancellable() : value(false) {}
1747 is_cancellable(bool b) : value(b) {}
1748 is_cancellable &operator=(bool b) {
1749 value = b;
1750 return *this;
1752 operator bool() const { return value; }
1754 template <> struct is_cancellable<false> {
1755 is_cancellable &operator=(bool b) { return *this; }
1756 constexpr operator bool() const { return false; }
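// When cancellable == false, the conversion to bool is a constexpr false, so
// every "if (cancelled)" test in __kmp_barrier_template below can be folded
// away at compile time.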
1759 // Internal function to do a barrier.
1760 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1761 If reduce is non-NULL, do a split reduction barrier; otherwise, do a
1762 barrier without reduction.
1763 When cancellable = false:
1764 Returns 0 if primary thread, 1 if worker thread.
1765 When cancellable = true:
1766 Returns 0 if not cancelled, 1 if cancelled. */
1767 template <bool cancellable = false>
1768 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1769 size_t reduce_size, void *reduce_data,
1770 void (*reduce)(void *, void *)) {
1771 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1772 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1773 int tid = __kmp_tid_from_gtid(gtid);
1774 kmp_info_t *this_thr = __kmp_threads[gtid];
1775 kmp_team_t *team = this_thr->th.th_team;
1776 int status = 0;
1777 is_cancellable<cancellable> cancelled;
1778 #if OMPT_SUPPORT && OMPT_OPTIONAL
1779 ompt_data_t *my_task_data;
1780 ompt_data_t *my_parallel_data;
1781 void *return_address;
1782 ompt_sync_region_t barrier_kind;
1783 #endif
1785 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1786 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1788 #if OMPT_SUPPORT
1789 if (ompt_enabled.enabled) {
1790 #if OMPT_OPTIONAL
1791 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1792 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1793 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1794 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1795 if (ompt_enabled.ompt_callback_sync_region) {
1796 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1797 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1798 return_address);
1800 if (ompt_enabled.ompt_callback_sync_region_wait) {
1801 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1802 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1803 return_address);
1805 #endif
1806 // It is OK to report the barrier state after the barrier begin callback.
1807 // According to the OMPT specification, a compliant implementation may
1808 // even delay reporting this state until the barrier begins to wait.
1809 auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1810 switch (barrier_kind) {
1811 case ompt_sync_region_barrier_explicit:
1812 ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1813 break;
1814 case ompt_sync_region_barrier_implicit_workshare:
1815 ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1816 break;
1817 case ompt_sync_region_barrier_implicit_parallel:
1818 ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1819 break;
1820 case ompt_sync_region_barrier_teams:
1821 ompt_thr_info->state = ompt_state_wait_barrier_teams;
1822 break;
1823 case ompt_sync_region_barrier_implementation:
1824 [[fallthrough]];
1825 default:
1826 ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1829 #endif
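// For a non-serialized team: run the gather phase of the selected barrier
// pattern, let the primary thread drain the task team and do ITT/debugger
// bookkeeping, then run the release phase -- unless this is a split barrier
// and we are the primary thread (status == 0), in which case the release is
// completed later (see __kmp_end_split_barrier()).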
1831 if (!team->t.t_serialized) {
1832 #if USE_ITT_BUILD
1833 // This value will be used in itt notify events below.
1834 void *itt_sync_obj = NULL;
1835 #if USE_ITT_NOTIFY
1836 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1837 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1838 #endif
1839 #endif /* USE_ITT_BUILD */
1840 if (__kmp_tasking_mode == tskm_extra_barrier) {
1841 __kmp_tasking_barrier(team, this_thr, gtid);
1842 KA_TRACE(15,
1843 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1844 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1847 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1848 access it when the team struct is not guaranteed to exist. */
1849 // See note about the corresponding code in __kmp_join_barrier() being
1850 // performance-critical.
1851 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1852 #if KMP_USE_MONITOR
1853 this_thr->th.th_team_bt_intervals =
1854 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1855 this_thr->th.th_team_bt_set =
1856 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1857 #else
1858 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1859 #endif
1862 #if USE_ITT_BUILD
1863 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1864 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1865 #endif /* USE_ITT_BUILD */
1866 #if USE_DEBUGGER
1867 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1868 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1869 team->t.t_bar[bt].b_master_arrived += 1;
1870 } else {
1871 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1872 } // if
1873 #endif /* USE_DEBUGGER */
1874 if (reduce != NULL) {
1875 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1876 this_thr->th.th_local.reduce_data = reduce_data;
1879 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1880 __kmp_task_team_setup(this_thr, team);
1882 if (cancellable) {
1883 cancelled = __kmp_linear_barrier_gather_cancellable(
1884 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1885 } else {
1886 switch (__kmp_barrier_gather_pattern[bt]) {
1887 case bp_dist_bar: {
1888 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1889 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1890 break;
1892 case bp_hyper_bar: {
1893 // don't set branch bits to 0; use linear
1894 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1895 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1896 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1897 break;
1899 case bp_hierarchical_bar: {
1900 __kmp_hierarchical_barrier_gather(
1901 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1902 break;
1904 case bp_tree_bar: {
1905 // don't set branch bits to 0; use linear
1906 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1907 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1908 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1909 break;
1911 default: {
1912 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1913 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1918 KMP_MB();
1920 if (KMP_MASTER_TID(tid)) {
1921 status = 0;
1922 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1923 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1925 #if USE_DEBUGGER
1926 // Let the debugger know: all threads have arrived and are starting to
1927 // leave the barrier.
1928 team->t.t_bar[bt].b_team_arrived += 1;
1929 #endif
1931 if (__kmp_omp_cancellation) {
1932 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1933 // Reset cancellation flag for worksharing constructs
1934 if (cancel_request == cancel_loop ||
1935 cancel_request == cancel_sections) {
1936 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1939 #if USE_ITT_BUILD
1940 /* TODO: In the case of a split reduction barrier, the primary thread may
1941 send the acquired event early, before the final summation into the shared
1942 variable is done (the final summation can be a long operation for array
1943 reductions). */
1944 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1945 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1946 #endif /* USE_ITT_BUILD */
1947 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1948 // Barrier - report frame end (only if active_level == 1)
1949 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1950 __kmp_forkjoin_frames_mode &&
1951 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1952 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1953 team->t.t_active_level == 1) {
1954 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1955 kmp_uint64 cur_time = __itt_get_timestamp();
1956 kmp_info_t **other_threads = team->t.t_threads;
1957 int nproc = this_thr->th.th_team_nproc;
1958 int i;
1959 switch (__kmp_forkjoin_frames_mode) {
1960 case 1:
1961 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1962 loc, nproc);
1963 this_thr->th.th_frame_time = cur_time;
1964 break;
1965 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1966 // be fixed)
1967 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1968 1, loc, nproc);
1969 break;
1970 case 3:
1971 if (__itt_metadata_add_ptr) {
1972 // Initialize with primary thread's wait time
1973 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1974 // Set arrive time to zero to be able to check it in
1975 // __kmp_invoke_task(); the same is done inside the loop below
1976 this_thr->th.th_bar_arrive_time = 0;
1977 for (i = 1; i < nproc; ++i) {
1978 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1979 other_threads[i]->th.th_bar_arrive_time = 0;
1981 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1982 cur_time, delta,
1983 (kmp_uint64)(reduce != NULL));
1985 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1986 loc, nproc);
1987 this_thr->th.th_frame_time = cur_time;
1988 break;
1991 #endif /* USE_ITT_BUILD */
1992 } else {
1993 status = 1;
1994 #if USE_ITT_BUILD
1995 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1996 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1997 #endif /* USE_ITT_BUILD */
1999 if ((status == 1 || !is_split) && !cancelled) {
2000 if (cancellable) {
2001 cancelled = __kmp_linear_barrier_release_cancellable(
2002 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2003 } else {
2004 switch (__kmp_barrier_release_pattern[bt]) {
2005 case bp_dist_bar: {
2006 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2007 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2008 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2009 break;
2011 case bp_hyper_bar: {
2012 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2013 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2014 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2015 break;
2017 case bp_hierarchical_bar: {
2018 __kmp_hierarchical_barrier_release(
2019 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2020 break;
2022 case bp_tree_bar: {
2023 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2024 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2025 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2026 break;
2028 default: {
2029 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2030 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2034 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2035 __kmp_task_team_sync(this_thr, team);
2039 #if USE_ITT_BUILD
2040 /* GEH: TODO: Move this under if-condition above and also include in
2041 __kmp_end_split_barrier(). This will more accurately represent the actual
2042 release time of the threads for split barriers. */
2043 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2044 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045 #endif /* USE_ITT_BUILD */
2046 } else { // Team is serialized.
2047 status = 0;
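// Even a serialized team can have an active task team if proxy or
// hidden-helper tasks were encountered (see the assertion below); wait for
// those tasks to finish and re-arm the task team before returning.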
2048 if (__kmp_tasking_mode != tskm_immediate_exec) {
2049 if (this_thr->th.th_task_team != NULL) {
2050 #if USE_ITT_NOTIFY
2051 void *itt_sync_obj = NULL;
2052 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2053 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2054 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2056 #endif
2058 KMP_DEBUG_ASSERT(
2059 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2060 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2061 TRUE);
2062 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2063 __kmp_task_team_setup(this_thr, team);
2065 #if USE_ITT_BUILD
2066 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2067 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2068 #endif /* USE_ITT_BUILD */
2072 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2073 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2074 __kmp_tid_from_gtid(gtid), status));
2076 #if OMPT_SUPPORT
2077 if (ompt_enabled.enabled) {
2078 #if OMPT_OPTIONAL
2079 if (ompt_enabled.ompt_callback_sync_region_wait) {
2080 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2081 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2082 return_address);
2084 if (ompt_enabled.ompt_callback_sync_region) {
2085 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2086 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2087 return_address);
2089 #endif
2090 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2092 #endif
2094 if (cancellable)
2095 return (int)cancelled;
2096 return status;
2099 // Returns 0 if primary thread, 1 if worker thread.
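// Non-cancellable entry point; a thin wrapper over __kmp_barrier_template<>.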
2100 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2101 size_t reduce_size, void *reduce_data,
2102 void (*reduce)(void *, void *)) {
2103 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2104 reduce);
2107 #if defined(KMP_GOMP_COMPAT)
2108 // Returns 1 if cancelled, 0 otherwise
2109 int __kmp_barrier_gomp_cancel(int gtid) {
2110 if (__kmp_omp_cancellation) {
2111 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2112 0, NULL, NULL);
2113 if (cancelled) {
2114 int tid = __kmp_tid_from_gtid(gtid);
2115 kmp_info_t *this_thr = __kmp_threads[gtid];
2116 if (KMP_MASTER_TID(tid)) {
2117 // Primary thread does not need to revert anything
2118 } else {
2119 // Workers need to revert their private b_arrived flag
2120 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2121 KMP_BARRIER_STATE_BUMP;
2124 return cancelled;
2126 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2127 return FALSE;
2129 #endif
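// Finish a split barrier started with is_split=TRUE in __kmp_barrier(): only
// the release phase runs here; the gather phase already completed when the
// split barrier was entered.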
2131 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2132 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2133 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2134 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2135 int tid = __kmp_tid_from_gtid(gtid);
2136 kmp_info_t *this_thr = __kmp_threads[gtid];
2137 kmp_team_t *team = this_thr->th.th_team;
2139 if (!team->t.t_serialized) {
2140 if (KMP_MASTER_GTID(gtid)) {
2141 switch (__kmp_barrier_release_pattern[bt]) {
2142 case bp_dist_bar: {
2143 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2144 FALSE USE_ITT_BUILD_ARG(NULL));
2145 break;
2147 case bp_hyper_bar: {
2148 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2149 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2150 FALSE USE_ITT_BUILD_ARG(NULL));
2151 break;
2153 case bp_hierarchical_bar: {
2154 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2155 FALSE USE_ITT_BUILD_ARG(NULL));
2156 break;
2158 case bp_tree_bar: {
2159 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2160 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2161 FALSE USE_ITT_BUILD_ARG(NULL));
2162 break;
2164 default: {
2165 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2166 FALSE USE_ITT_BUILD_ARG(NULL));
2169 if (__kmp_tasking_mode != tskm_immediate_exec) {
2170 __kmp_task_team_sync(this_thr, team);
2171 } // if
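// Barrier at the end of a parallel region (join point). Only the gather phase
// runs here; workers are released later, from the fork barrier of the next
// parallel region, so after this point the team structure may be deallocated
// by the primary thread at any time.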
2176 void __kmp_join_barrier(int gtid) {
2177 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2178 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2180 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2182 kmp_info_t *this_thr = __kmp_threads[gtid];
2183 kmp_team_t *team;
2184 int tid;
2185 #ifdef KMP_DEBUG
2186 int team_id;
2187 #endif /* KMP_DEBUG */
2188 #if USE_ITT_BUILD
2189 void *itt_sync_obj = NULL;
2190 #if USE_ITT_NOTIFY
2191 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
2192 // Get object created at fork_barrier
2193 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2194 #endif
2195 #endif /* USE_ITT_BUILD */
2196 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2197 int nproc = this_thr->th.th_team_nproc;
2198 #endif
2199 KMP_MB();
2201 // Get current info
2202 team = this_thr->th.th_team;
2203 KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2204 tid = __kmp_tid_from_gtid(gtid);
2205 #ifdef KMP_DEBUG
2206 team_id = team->t.t_id;
2207 kmp_info_t *master_thread = this_thr->th.th_team_master;
2208 if (master_thread != team->t.t_threads[0]) {
2209 __kmp_print_structure();
2211 #endif /* KMP_DEBUG */
2212 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2213 KMP_MB();
2215 // Verify state
2216 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2217 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2218 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2219 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2220 gtid, team_id, tid));
2222 #if OMPT_SUPPORT
2223 if (ompt_enabled.enabled) {
2224 #if OMPT_OPTIONAL
2225 ompt_data_t *my_task_data;
2226 ompt_data_t *my_parallel_data;
2227 void *codeptr = NULL;
2228 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2229 if (KMP_MASTER_TID(ds_tid) &&
2230 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2231 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2232 codeptr = team->t.ompt_team_info.master_return_address;
2233 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2234 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2235 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2236 ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2237 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2238 sync_kind = ompt_sync_region_barrier_teams;
2239 ompt_state = ompt_state_wait_barrier_teams;
2241 if (ompt_enabled.ompt_callback_sync_region) {
2242 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2243 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2245 if (ompt_enabled.ompt_callback_sync_region_wait) {
2246 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2247 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2249 if (!KMP_MASTER_TID(ds_tid))
2250 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2251 #endif
2252 this_thr->th.ompt_thread_info.state = ompt_state;
2254 #endif
2256 if (__kmp_tasking_mode == tskm_extra_barrier) {
2257 __kmp_tasking_barrier(team, this_thr, gtid);
2258 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2259 gtid, team_id, tid));
2261 #ifdef KMP_DEBUG
2262 if (__kmp_tasking_mode != tskm_immediate_exec) {
2263 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2264 "%p, th_task_team = %p\n",
2265 __kmp_gtid_from_thread(this_thr), team_id,
2266 team->t.t_task_team[this_thr->th.th_task_state],
2267 this_thr->th.th_task_team));
2268 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2270 #endif /* KMP_DEBUG */
2272 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2273 access it when the team struct is not guaranteed to exist. Doing these
2274 loads causes a cache miss that slows down EPCC parallel by 2x. As a
2275 workaround, we do not perform the copy if blocktime=infinite, since the
2276 values are not used by __kmp_wait_template() in that case. */
2277 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2278 #if KMP_USE_MONITOR
2279 this_thr->th.th_team_bt_intervals =
2280 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2281 this_thr->th.th_team_bt_set =
2282 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2283 #else
2284 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2285 #endif
2288 #if USE_ITT_BUILD
2289 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2290 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2291 #endif /* USE_ITT_BUILD */
2293 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2294 case bp_dist_bar: {
2295 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2296 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2297 break;
2299 case bp_hyper_bar: {
2300 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2301 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2302 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2303 break;
2305 case bp_hierarchical_bar: {
2306 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2307 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2308 break;
2310 case bp_tree_bar: {
2311 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2312 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2313 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2314 break;
2316 default: {
2317 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2318 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2322 /* From this point on, the team data structure may be deallocated at any time
2323 by the primary thread - it is unsafe to reference it in any of the worker
2324 threads. Any per-team data items that need to be referenced before the
2325 end of the barrier should be moved to the kmp_task_team_t structs. */
2326 if (KMP_MASTER_TID(tid)) {
2327 if (__kmp_tasking_mode != tskm_immediate_exec) {
2328 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2330 if (__kmp_display_affinity) {
2331 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2333 #if KMP_STATS_ENABLED
2334 // Have the primary thread flag the workers to indicate they are now waiting
2335 // for the next parallel region. Also wake them up so they switch their
2336 // timers to idle.
2337 for (int i = 0; i < team->t.t_nproc; ++i) {
2338 kmp_info_t *team_thread = team->t.t_threads[i];
2339 if (team_thread == this_thr)
2340 continue;
2341 team_thread->th.th_stats->setIdleFlag();
2342 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2343 team_thread->th.th_sleep_loc != NULL)
2344 __kmp_null_resume_wrapper(team_thread);
2346 #endif
2347 #if USE_ITT_BUILD
2348 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2349 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2350 #endif /* USE_ITT_BUILD */
2352 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2353 // Join barrier - report frame end
2354 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2355 __kmp_forkjoin_frames_mode &&
2356 (this_thr->th.th_teams_microtask == NULL || // either not in teams
2357 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2358 team->t.t_active_level == 1) {
2359 kmp_uint64 cur_time = __itt_get_timestamp();
2360 ident_t *loc = team->t.t_ident;
2361 kmp_info_t **other_threads = team->t.t_threads;
2362 switch (__kmp_forkjoin_frames_mode) {
2363 case 1:
2364 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2365 loc, nproc);
2366 break;
2367 case 2:
2368 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2369 loc, nproc);
2370 break;
2371 case 3:
2372 if (__itt_metadata_add_ptr) {
2373 // Initialize with primary thread's wait time
2374 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2375 // Set arrive time to zero to be able to check it in
2376 // __kmp_invoke_task(); the same is done inside the loop below
2377 this_thr->th.th_bar_arrive_time = 0;
2378 for (int i = 1; i < nproc; ++i) {
2379 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2380 other_threads[i]->th.th_bar_arrive_time = 0;
2382 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2383 cur_time, delta, 0);
2385 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2386 loc, nproc);
2387 this_thr->th.th_frame_time = cur_time;
2388 break;
2391 #endif /* USE_ITT_BUILD */
2393 #if USE_ITT_BUILD
2394 else {
2395 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2396 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2398 #endif /* USE_ITT_BUILD */
2400 #if KMP_DEBUG
2401 if (KMP_MASTER_TID(tid)) {
2402 KA_TRACE(
2404 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2405 gtid, team_id, tid, nproc));
2407 #endif /* KMP_DEBUG */
2409 // TODO now, mark worker threads as done so they may be disbanded
2410 KMP_MB(); // Flush all pending memory write invalidates.
2411 KA_TRACE(10,
2412 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2416 // TODO release worker threads' fork barriers as we are ready instead of all at
2417 // once
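// Barrier at the start of a parallel region (fork point). The primary thread
// sets up the task team and then releases the workers (release phase only);
// each worker then syncs its task team, re-applies affinity/places if needed
// and, with KMP_BARRIER_ICV_PULL, copies the ICVs from the primary thread's
// th_fixed_icvs into its own implicit task.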
2418 void __kmp_fork_barrier(int gtid, int tid) {
2419 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2420 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2421 kmp_info_t *this_thr = __kmp_threads[gtid];
2422 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2423 #if USE_ITT_BUILD
2424 void *itt_sync_obj = NULL;
2425 #endif /* USE_ITT_BUILD */
2426 #ifdef KMP_DEBUG
2427 if (team)
2428 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2429 (team != NULL) ? team->t.t_id : -1, tid));
2430 #endif
2431 // th_team pointer only valid for primary thread here
2432 if (KMP_MASTER_TID(tid)) {
2433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2434 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2435 // Create itt barrier object
2436 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2437 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2441 #ifdef KMP_DEBUG
2442 KMP_DEBUG_ASSERT(team);
2443 kmp_info_t **other_threads = team->t.t_threads;
2444 int i;
2446 // Verify state
2447 KMP_MB();
2449 for (i = 1; i < team->t.t_nproc; ++i) {
2450 KA_TRACE(500,
2451 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2452 "== %u.\n",
2453 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2454 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2455 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2456 KMP_DEBUG_ASSERT(
2457 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2458 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2459 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2461 #endif
2463 if (__kmp_tasking_mode != tskm_immediate_exec)
2464 __kmp_task_team_setup(this_thr, team);
2466 /* The primary thread may have changed its blocktime between join barrier
2467 and fork barrier. Copy the blocktime info to the thread, where
2468 __kmp_wait_template() can access it when the team struct is not
2469 guaranteed to exist. */
2470 // See note about the corresponding code in __kmp_join_barrier() being
2471 // performance-critical
2472 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2473 #if KMP_USE_MONITOR
2474 this_thr->th.th_team_bt_intervals =
2475 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2476 this_thr->th.th_team_bt_set =
2477 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2478 #else
2479 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2480 #endif
2482 } // primary thread
2484 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2485 case bp_dist_bar: {
2486 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2487 TRUE USE_ITT_BUILD_ARG(NULL));
2488 break;
2490 case bp_hyper_bar: {
2491 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2492 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2493 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2494 break;
2496 case bp_hierarchical_bar: {
2497 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2498 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2499 break;
2501 case bp_tree_bar: {
2502 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2503 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2504 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2505 break;
2507 default: {
2508 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2509 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2513 #if OMPT_SUPPORT
2514 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2515 if (ompt_enabled.enabled &&
2516 (ompt_state == ompt_state_wait_barrier_teams ||
2517 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2518 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2519 ompt_data_t *task_data = (team)
2520 ? OMPT_CUR_TASK_DATA(this_thr)
2521 : &(this_thr->th.ompt_thread_info.task_data);
2522 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2523 #if OMPT_OPTIONAL
2524 void *codeptr = NULL;
2525 if (KMP_MASTER_TID(ds_tid) &&
2526 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2527 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2528 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2529 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2530 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2531 sync_kind = ompt_sync_region_barrier_teams;
2532 if (ompt_enabled.ompt_callback_sync_region_wait) {
2533 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2534 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2536 if (ompt_enabled.ompt_callback_sync_region) {
2537 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2538 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2540 #endif
2541 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2542 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2543 ompt_scope_end, NULL, task_data, 0, ds_tid,
2544 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2547 #endif
2549 // Early exit for reaping threads releasing forkjoin barrier
2550 if (TCR_4(__kmp_global.g.g_done)) {
2551 this_thr->th.th_task_team = NULL;
2553 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2554 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2555 if (!KMP_MASTER_TID(tid)) {
2556 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2557 if (itt_sync_obj)
2558 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2561 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2562 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2563 return;
2566 /* We can now assume that a valid team structure has been allocated by the
2567 primary thread and propagated to all worker threads. The current thread,
2568 however, may not be part of the team, so we can't blindly assume that the
2569 team pointer is non-null. */
2570 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2571 KMP_DEBUG_ASSERT(team != NULL);
2572 tid = __kmp_tid_from_gtid(gtid);
2574 #if KMP_BARRIER_ICV_PULL
2575 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2576 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2577 implicit task has this data before this function is called. We cannot
2578 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2579 thread struct, because it is not always the case that the threads arrays
2580 have been allocated when __kmp_fork_call() is executed. */
2582 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2583 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2584 // Copy the initial ICVs from the primary thread's thread struct to the
2585 // implicit task for this tid.
2586 KA_TRACE(10,
2587 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2588 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2589 tid, FALSE);
2590 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2591 &team->t.t_threads[0]
2592 ->th.th_bar[bs_forkjoin_barrier]
2593 .bb.th_fixed_icvs);
2596 #endif // KMP_BARRIER_ICV_PULL
2598 if (__kmp_tasking_mode != tskm_immediate_exec) {
2599 __kmp_task_team_sync(this_thr, team);
2602 #if KMP_AFFINITY_SUPPORTED
2603 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2604 if (proc_bind == proc_bind_intel) {
2605 // Call dynamic affinity settings
2606 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2607 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2609 } else if (proc_bind != proc_bind_false) {
2610 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2611 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2612 __kmp_gtid_from_thread(this_thr),
2613 this_thr->th.th_current_place));
2614 } else {
2615 __kmp_affinity_bind_place(gtid);
2618 #endif // KMP_AFFINITY_SUPPORTED
2619 // Perform the display affinity functionality
2620 if (__kmp_display_affinity) {
2621 if (team->t.t_display_affinity
2622 #if KMP_AFFINITY_SUPPORTED
2623 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2624 #endif
2626 // NULL means use the affinity-format-var ICV
2627 __kmp_aux_display_affinity(gtid, NULL);
2628 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2629 this_thr->th.th_prev_level = team->t.t_level;
2632 if (!KMP_MASTER_TID(tid))
2633 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2635 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2636 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2637 if (!KMP_MASTER_TID(tid)) {
2638 // Get correct barrier object
2639 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2640 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2641 } // (prepare called inside barrier_release)
2643 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2644 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2645 team->t.t_id, tid));
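// Prepare ICV propagation for a newly (re)formed team. Depending on the
// configuration, the ICVs are either stashed in the primary thread's
// th_fixed_icvs for workers to PULL at the fork barrier, left to be PUSHed
// during the fork-barrier release, or copied linearly into every worker's
// implicit task right here.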
2648 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2649 kmp_internal_control_t *new_icvs, ident_t *loc) {
2650 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2652 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2653 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2655 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2656 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2657 implicit task has this data before this function is called. */
2658 #if KMP_BARRIER_ICV_PULL
2659 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2660 remains untouched), where all of the worker threads can access them and
2661 make their own copies after the barrier. */
2662 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2663 // allocated at this point
2664 copy_icvs(
2665 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2666 new_icvs);
2667 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2668 team->t.t_threads[0], team));
2669 #elif KMP_BARRIER_ICV_PUSH
2670 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2671 // done here.
2672 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2673 team->t.t_threads[0], team));
2674 #else
2675 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2676 // time.
2677 ngo_load(new_icvs);
2678 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2679 // allocated at this point
2680 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2681 // TODO: GEH - pass in better source location info since usually NULL here
2682 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2683 f, team->t.t_threads[f], team));
2684 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2685 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2686 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2687 f, team->t.t_threads[f], team));
2689 ngo_sync();
2690 #endif // KMP_BARRIER_ICV_PULL