[clang][modules] Don't prevent translation of FW_Private includes when explicitly...
[llvm-project.git] / openmp / runtime / src / kmp_runtime.cpp
blobe83c09383769a518894ae396382ba46743fe5692
1 /*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
50 #if KMP_OS_WINDOWS
51 // windows does not need include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
64 char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
74 /* ------------------------------------------------------------------------ */
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
80 /* Forward declarations */
82 void __kmp_cleanup(void);
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133 __kmp_init_gtid for this to work. */
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
138 #ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
143 #endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
164 for (i = 0; i < __kmp_threads_capacity; i++) {
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
173 /* stack grows down -- search through all of the active threads */
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 // __kmp_gtid_get_specific can return negative value because this
182 // function can be called by thread destructor. However, before the
183 // thread destructor is called, the value of the corresponding
184 // thread-specific data will be reset to NULL.
185 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
186 __kmp_gtid_get_specific() == i);
187 return i;
192 /* get specific to try and determine our gtid */
193 KA_TRACE(1000,
194 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
195 "thread, using TLS\n"));
196 i = __kmp_gtid_get_specific();
198 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
200 /* if we havn't been assigned a gtid, then return code */
201 if (i < 0)
202 return i;
204 // other_threads[i] can be nullptr at this point because the corresponding
205 // thread could have already been destructed. It can happen when this function
206 // is called in end library routine.
207 if (!TCR_SYNC_PTR(other_threads[i]))
208 return i;
210 /* dynamically updated stack window for uber threads to avoid get_specific
211 call */
212 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
213 KMP_FATAL(StackOverflow, i);
216 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217 if (stack_addr > stack_base) {
218 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
219 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
220 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
221 stack_base);
222 } else {
223 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
224 stack_base - stack_addr);
227 /* Reprint stack bounds for ubermaster since they have been refined */
228 if (__kmp_storage_map) {
229 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
230 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
231 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
232 other_threads[i]->th.th_info.ds.ds_stacksize,
233 "th_%d stack (refinement)", i);
235 return i;
238 int __kmp_get_global_thread_id_reg() {
239 int gtid;
241 if (!__kmp_init_serial) {
242 gtid = KMP_GTID_DNE;
243 } else
244 #ifdef KMP_TDATA_GTID
245 if (TCR_4(__kmp_gtid_mode) >= 3) {
246 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
247 gtid = __kmp_gtid;
248 } else
249 #endif
250 if (TCR_4(__kmp_gtid_mode) >= 2) {
251 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
252 gtid = __kmp_gtid_get_specific();
253 } else {
254 KA_TRACE(1000,
255 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
256 gtid = __kmp_get_global_thread_id();
259 /* we must be a new uber master sibling thread */
260 if (gtid == KMP_GTID_DNE) {
261 KA_TRACE(10,
262 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
263 "Registering a new gtid.\n"));
264 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
265 if (!__kmp_init_serial) {
266 __kmp_do_serial_initialize();
267 gtid = __kmp_gtid_get_specific();
268 } else {
269 gtid = __kmp_register_root(FALSE);
271 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
272 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
275 KMP_DEBUG_ASSERT(gtid >= 0);
277 return gtid;
280 /* caller must hold forkjoin_lock */
281 void __kmp_check_stack_overlap(kmp_info_t *th) {
282 int f;
283 char *stack_beg = NULL;
284 char *stack_end = NULL;
285 int gtid;
287 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
288 if (__kmp_storage_map) {
289 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
290 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
292 gtid = __kmp_gtid_from_thread(th);
294 if (gtid == KMP_GTID_MONITOR) {
295 __kmp_print_storage_map_gtid(
296 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
297 "th_%s stack (%s)", "mon",
298 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
299 } else {
300 __kmp_print_storage_map_gtid(
301 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
302 "th_%d stack (%s)", gtid,
303 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
307 /* No point in checking ubermaster threads since they use refinement and
308 * cannot overlap */
309 gtid = __kmp_gtid_from_thread(th);
310 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
311 KA_TRACE(10,
312 ("__kmp_check_stack_overlap: performing extensive checking\n"));
313 if (stack_beg == NULL) {
314 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
315 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
318 for (f = 0; f < __kmp_threads_capacity; f++) {
319 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
321 if (f_th && f_th != th) {
322 char *other_stack_end =
323 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
324 char *other_stack_beg =
325 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
326 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
327 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
329 /* Print the other stack values before the abort */
330 if (__kmp_storage_map)
331 __kmp_print_storage_map_gtid(
332 -1, other_stack_beg, other_stack_end,
333 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
334 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
336 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
337 __kmp_msg_null);
342 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
345 /* ------------------------------------------------------------------------ */
347 void __kmp_infinite_loop(void) {
348 static int done = FALSE;
350 while (!done) {
351 KMP_YIELD(TRUE);
355 #define MAX_MESSAGE 512
357 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
358 char const *format, ...) {
359 char buffer[MAX_MESSAGE];
360 va_list ap;
362 va_start(ap, format);
363 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
364 p2, (unsigned long)size, format);
365 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
366 __kmp_vprintf(kmp_err, buffer, ap);
367 #if KMP_PRINT_DATA_PLACEMENT
368 int node;
369 if (gtid >= 0) {
370 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
371 if (__kmp_storage_map_verbose) {
372 node = __kmp_get_host_node(p1);
373 if (node < 0) /* doesn't work, so don't try this next time */
374 __kmp_storage_map_verbose = FALSE;
375 else {
376 char *last;
377 int lastNode;
378 int localProc = __kmp_get_cpu_from_gtid(gtid);
380 const int page_size = KMP_GET_PAGE_SIZE();
382 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
383 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
384 if (localProc >= 0)
385 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
386 localProc >> 1);
387 else
388 __kmp_printf_no_lock(" GTID %d\n", gtid);
389 #if KMP_USE_PRCTL
390 /* The more elaborate format is disabled for now because of the prctl
391 * hanging bug. */
392 do {
393 last = p1;
394 lastNode = node;
395 /* This loop collates adjacent pages with the same host node. */
396 do {
397 (char *)p1 += page_size;
398 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
399 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
400 lastNode);
401 } while (p1 <= p2);
402 #else
403 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
404 (char *)p1 + (page_size - 1),
405 __kmp_get_host_node(p1));
406 if (p1 < p2) {
407 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
408 (char *)p2 + (page_size - 1),
409 __kmp_get_host_node(p2));
411 #endif
414 } else
415 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
417 #endif /* KMP_PRINT_DATA_PLACEMENT */
418 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
420 va_end(ap);
423 void __kmp_warn(char const *format, ...) {
424 char buffer[MAX_MESSAGE];
425 va_list ap;
427 if (__kmp_generate_warnings == kmp_warnings_off) {
428 return;
431 va_start(ap, format);
433 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
434 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
435 __kmp_vprintf(kmp_err, buffer, ap);
436 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
438 va_end(ap);
441 void __kmp_abort_process() {
442 // Later threads may stall here, but that's ok because abort() will kill them.
443 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
445 if (__kmp_debug_buf) {
446 __kmp_dump_debug_buffer();
449 if (KMP_OS_WINDOWS) {
450 // Let other threads know of abnormal termination and prevent deadlock
451 // if abort happened during library initialization or shutdown
452 __kmp_global.g.g_abort = SIGABRT;
454 /* On Windows* OS by default abort() causes pop-up error box, which stalls
455 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
456 boxes. _set_abort_behavior() works well, but this function is not
457 available in VS7 (this is not problem for DLL, but it is a problem for
458 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
459 help, at least in some versions of MS C RTL.
461 It seems following sequence is the only way to simulate abort() and
462 avoid pop-up error box. */
463 raise(SIGABRT);
464 _exit(3); // Just in case, if signal ignored, exit anyway.
465 } else {
466 __kmp_unregister_library();
467 abort();
470 __kmp_infinite_loop();
471 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
473 } // __kmp_abort_process
475 void __kmp_abort_thread(void) {
476 // TODO: Eliminate g_abort global variable and this function.
477 // In case of abort just call abort(), it will kill all the threads.
478 __kmp_infinite_loop();
479 } // __kmp_abort_thread
481 /* Print out the storage map for the major kmp_info_t thread data structures
482 that are allocated together. */
484 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
485 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
486 gtid);
488 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
489 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
491 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
492 sizeof(kmp_local_t), "th_%d.th_local", gtid);
494 __kmp_print_storage_map_gtid(
495 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
496 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
498 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
499 &thr->th.th_bar[bs_plain_barrier + 1],
500 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
501 gtid);
503 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
504 &thr->th.th_bar[bs_forkjoin_barrier + 1],
505 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
506 gtid);
508 #if KMP_FAST_REDUCTION_BARRIER
509 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
510 &thr->th.th_bar[bs_reduction_barrier + 1],
511 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
512 gtid);
513 #endif // KMP_FAST_REDUCTION_BARRIER
516 /* Print out the storage map for the major kmp_team_t team data structures
517 that are allocated together. */
519 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
520 int team_id, int num_thr) {
521 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
522 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
523 header, team_id);
525 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
526 &team->t.t_bar[bs_last_barrier],
527 sizeof(kmp_balign_team_t) * bs_last_barrier,
528 "%s_%d.t_bar", header, team_id);
530 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
531 &team->t.t_bar[bs_plain_barrier + 1],
532 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
533 header, team_id);
535 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
536 &team->t.t_bar[bs_forkjoin_barrier + 1],
537 sizeof(kmp_balign_team_t),
538 "%s_%d.t_bar[forkjoin]", header, team_id);
540 #if KMP_FAST_REDUCTION_BARRIER
541 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
542 &team->t.t_bar[bs_reduction_barrier + 1],
543 sizeof(kmp_balign_team_t),
544 "%s_%d.t_bar[reduction]", header, team_id);
545 #endif // KMP_FAST_REDUCTION_BARRIER
547 __kmp_print_storage_map_gtid(
548 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
549 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
551 __kmp_print_storage_map_gtid(
552 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
553 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
555 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
556 &team->t.t_disp_buffer[num_disp_buff],
557 sizeof(dispatch_shared_info_t) * num_disp_buff,
558 "%s_%d.t_disp_buffer", header, team_id);
561 static void __kmp_init_allocator() {
562 __kmp_init_memkind();
563 __kmp_init_target_mem();
565 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
567 /* ------------------------------------------------------------------------ */
#if ENABLE_LIBOMPTARGET
/* Offload start-up hook (libomptarget builds only): initialize target-task
   support. */
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
#endif
575 /* ------------------------------------------------------------------------ */
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

/* DLL entry point for the dynamic Windows build. Always returns TRUE.
   Process detach shuts the library down only when it results from
   FreeLibrary(); thread detach ends the detaching thread's runtime state. */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  if (fdwReason == DLL_PROCESS_ATTACH) {
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
  } else if (fdwReason == DLL_PROCESS_DETACH) {
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());
  } else if (fdwReason == DLL_THREAD_ATTACH) {
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
  } else if (fdwReason == DLL_THREAD_DETACH) {
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */
629 /* __kmp_parallel_deo -- Wait until it's our turn. */
630 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631 int gtid = *gtid_ref;
632 #ifdef BUILD_PARALLEL_ORDERED
633 kmp_team_t *team = __kmp_team_from_gtid(gtid);
634 #endif /* BUILD_PARALLEL_ORDERED */
636 if (__kmp_env_consistency_check) {
637 if (__kmp_threads[gtid]->th.th_root->r.r_active)
638 #if KMP_USE_DYNAMIC_LOCK
639 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
640 #else
641 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
642 #endif
644 #ifdef BUILD_PARALLEL_ORDERED
645 if (!team->t.t_serialized) {
646 KMP_MB();
647 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
648 NULL);
649 KMP_MB();
651 #endif /* BUILD_PARALLEL_ORDERED */
654 /* __kmp_parallel_dxo -- Signal the next task. */
655 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
656 int gtid = *gtid_ref;
657 #ifdef BUILD_PARALLEL_ORDERED
658 int tid = __kmp_tid_from_gtid(gtid);
659 kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
662 if (__kmp_env_consistency_check) {
663 if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
666 #ifdef BUILD_PARALLEL_ORDERED
667 if (!team->t.t_serialized) {
668 KMP_MB(); /* Flush all pending memory write invalidates. */
670 /* use the tid of the next thread in this team */
671 /* TODO replace with general release procedure */
672 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
674 KMP_MB(); /* Flush all pending memory write invalidates. */
676 #endif /* BUILD_PARALLEL_ORDERED */
679 /* ------------------------------------------------------------------------ */
680 /* The BARRIER for a SINGLE process section is always explicit */
682 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
683 int status;
684 kmp_info_t *th;
685 kmp_team_t *team;
687 if (!TCR_4(__kmp_init_parallel))
688 __kmp_parallel_initialize();
689 __kmp_resume_if_soft_paused();
691 th = __kmp_threads[gtid];
692 team = th->th.th_team;
693 status = 0;
695 th->th.th_ident = id_ref;
697 if (team->t.t_serialized) {
698 status = 1;
699 } else {
700 kmp_int32 old_this = th->th.th_local.this_construct;
702 ++th->th.th_local.this_construct;
703 /* try to set team count to thread count--success means thread got the
704 single block */
705 /* TODO: Should this be acquire or release? */
706 if (team->t.t_construct == old_this) {
707 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
708 th->th.th_local.this_construct);
710 #if USE_ITT_BUILD
711 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
712 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
713 team->t.t_active_level == 1) {
714 // Only report metadata by primary thread of active team at level 1
715 __kmp_itt_metadata_single(id_ref);
717 #endif /* USE_ITT_BUILD */
720 if (__kmp_env_consistency_check) {
721 if (status && push_ws) {
722 __kmp_push_workshare(gtid, ct_psingle, id_ref);
723 } else {
724 __kmp_check_workshare(gtid, ct_psingle, id_ref);
727 #if USE_ITT_BUILD
728 if (status) {
729 __kmp_itt_single_start(gtid);
731 #endif /* USE_ITT_BUILD */
732 return status;
735 void __kmp_exit_single(int gtid) {
736 #if USE_ITT_BUILD
737 __kmp_itt_single_end(gtid);
738 #endif /* USE_ITT_BUILD */
739 if (__kmp_env_consistency_check)
740 __kmp_pop_workshare(gtid, ct_psingle, NULL);
743 /* determine if we can go parallel or must use a serialized parallel region and
744 * how many threads we can use
745 * set_nproc is the number of threads requested for the team
746 * returns 0 if we should serialize or only use one thread,
747 * otherwise the number of threads to use
748 * The forkjoin lock is held by the caller. */
749 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
750 int master_tid, int set_nthreads,
751 int enter_teams) {
752 int capacity;
753 int new_nthreads;
754 KMP_DEBUG_ASSERT(__kmp_init_serial);
755 KMP_DEBUG_ASSERT(root && parent_team);
756 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
758 // If dyn-var is set, dynamically adjust the number of desired threads,
759 // according to the method specified by dynamic_mode.
760 new_nthreads = set_nthreads;
761 if (!get__dynamic_2(parent_team, master_tid)) {
764 #ifdef USE_LOAD_BALANCE
765 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
766 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
767 if (new_nthreads == 1) {
768 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
769 "reservation to 1 thread\n",
770 master_tid));
771 return 1;
773 if (new_nthreads < set_nthreads) {
774 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
775 "reservation to %d threads\n",
776 master_tid, new_nthreads));
779 #endif /* USE_LOAD_BALANCE */
780 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
781 new_nthreads = __kmp_avail_proc - __kmp_nth +
782 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
783 if (new_nthreads <= 1) {
784 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
785 "reservation to 1 thread\n",
786 master_tid));
787 return 1;
789 if (new_nthreads < set_nthreads) {
790 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
791 "reservation to %d threads\n",
792 master_tid, new_nthreads));
793 } else {
794 new_nthreads = set_nthreads;
796 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
797 if (set_nthreads > 2) {
798 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
799 new_nthreads = (new_nthreads % set_nthreads) + 1;
800 if (new_nthreads == 1) {
801 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
802 "reservation to 1 thread\n",
803 master_tid));
804 return 1;
806 if (new_nthreads < set_nthreads) {
807 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
808 "reservation to %d threads\n",
809 master_tid, new_nthreads));
812 } else {
813 KMP_ASSERT(0);
816 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
817 if (__kmp_nth + new_nthreads -
818 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
819 __kmp_max_nth) {
820 int tl_nthreads = __kmp_max_nth - __kmp_nth +
821 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
822 if (tl_nthreads <= 0) {
823 tl_nthreads = 1;
826 // If dyn-var is false, emit a 1-time warning.
827 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
828 __kmp_reserve_warn = 1;
829 __kmp_msg(kmp_ms_warning,
830 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
831 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
833 if (tl_nthreads == 1) {
834 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
835 "reduced reservation to 1 thread\n",
836 master_tid));
837 return 1;
839 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
840 "reservation to %d threads\n",
841 master_tid, tl_nthreads));
842 new_nthreads = tl_nthreads;
845 // Respect OMP_THREAD_LIMIT
846 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
847 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
848 if (cg_nthreads + new_nthreads -
849 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
850 max_cg_threads) {
851 int tl_nthreads = max_cg_threads - cg_nthreads +
852 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853 if (tl_nthreads <= 0) {
854 tl_nthreads = 1;
857 // If dyn-var is false, emit a 1-time warning.
858 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
859 __kmp_reserve_warn = 1;
860 __kmp_msg(kmp_ms_warning,
861 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
862 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
864 if (tl_nthreads == 1) {
865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
866 "reduced reservation to 1 thread\n",
867 master_tid));
868 return 1;
870 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
871 "reservation to %d threads\n",
872 master_tid, tl_nthreads));
873 new_nthreads = tl_nthreads;
876 // Check if the threads array is large enough, or needs expanding.
877 // See comment in __kmp_register_root() about the adjustment if
878 // __kmp_threads[0] == NULL.
879 capacity = __kmp_threads_capacity;
880 if (TCR_PTR(__kmp_threads[0]) == NULL) {
881 --capacity;
883 // If it is not for initializing the hidden helper team, we need to take
884 // __kmp_hidden_helper_threads_num out of the capacity because it is included
885 // in __kmp_threads_capacity.
886 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
887 capacity -= __kmp_hidden_helper_threads_num;
889 if (__kmp_nth + new_nthreads -
890 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
891 capacity) {
892 // Expand the threads array.
893 int slotsRequired = __kmp_nth + new_nthreads -
894 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
895 capacity;
896 int slotsAdded = __kmp_expand_threads(slotsRequired);
897 if (slotsAdded < slotsRequired) {
898 // The threads array was not expanded enough.
899 new_nthreads -= (slotsRequired - slotsAdded);
900 KMP_ASSERT(new_nthreads >= 1);
902 // If dyn-var is false, emit a 1-time warning.
903 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
904 __kmp_reserve_warn = 1;
905 if (__kmp_tp_cached) {
906 __kmp_msg(kmp_ms_warning,
907 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
908 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
909 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
910 } else {
911 __kmp_msg(kmp_ms_warning,
912 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
913 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
919 #ifdef KMP_DEBUG
920 if (new_nthreads == 1) {
921 KC_TRACE(10,
922 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
923 "dead roots and rechecking; requested %d threads\n",
924 __kmp_get_gtid(), set_nthreads));
925 } else {
926 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
927 " %d threads\n",
928 __kmp_get_gtid(), new_nthreads, set_nthreads));
930 #endif // KMP_DEBUG
931 return new_nthreads;
934 /* Allocate threads from the thread pool and assign them to the new team. We are
935 assured that there are enough threads available, because we checked on that
936 earlier within critical section forkjoin */
937 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
938 kmp_info_t *master_th, int master_gtid,
939 int fork_teams_workers) {
940 int i;
941 int use_hot_team;
943 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
944 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
945 KMP_MB();
947 /* first, let's setup the primary thread */
948 master_th->th.th_info.ds.ds_tid = 0;
949 master_th->th.th_team = team;
950 master_th->th.th_team_nproc = team->t.t_nproc;
951 master_th->th.th_team_master = master_th;
952 master_th->th.th_team_serialized = FALSE;
953 master_th->th.th_dispatch = &team->t.t_dispatch[0];
955 /* make sure we are not the optimized hot team */
956 #if KMP_NESTED_HOT_TEAMS
957 use_hot_team = 0;
958 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
959 if (hot_teams) { // hot teams array is not allocated if
960 // KMP_HOT_TEAMS_MAX_LEVEL=0
961 int level = team->t.t_active_level - 1; // index in array of hot teams
962 if (master_th->th.th_teams_microtask) { // are we inside the teams?
963 if (master_th->th.th_teams_size.nteams > 1) {
964 ++level; // level was not increased in teams construct for
965 // team_of_masters
967 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
968 master_th->th.th_teams_level == team->t.t_level) {
969 ++level; // level was not increased in teams construct for
970 // team_of_workers before the parallel
971 } // team->t.t_level will be increased inside parallel
973 if (level < __kmp_hot_teams_max_level) {
974 if (hot_teams[level].hot_team) {
975 // hot team has already been allocated for given level
976 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
977 use_hot_team = 1; // the team is ready to use
978 } else {
979 use_hot_team = 0; // AC: threads are not allocated yet
980 hot_teams[level].hot_team = team; // remember new hot team
981 hot_teams[level].hot_team_nth = team->t.t_nproc;
983 } else {
984 use_hot_team = 0;
987 #else
988 use_hot_team = team == root->r.r_hot_team;
989 #endif
990 if (!use_hot_team) {
992 /* install the primary thread */
993 team->t.t_threads[0] = master_th;
994 __kmp_initialize_info(master_th, team, 0, master_gtid);
996 /* now, install the worker threads */
997 for (i = 1; i < team->t.t_nproc; i++) {
999 /* fork or reallocate a new thread and install it in team */
1000 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1001 team->t.t_threads[i] = thr;
1002 KMP_DEBUG_ASSERT(thr);
1003 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1004 /* align team and thread arrived states */
1005 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1006 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1007 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1008 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1009 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1010 team->t.t_bar[bs_plain_barrier].b_arrived));
1011 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1012 thr->th.th_teams_level = master_th->th.th_teams_level;
1013 thr->th.th_teams_size = master_th->th.th_teams_size;
1014 { // Initialize threads' barrier data.
1015 int b;
1016 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1017 for (b = 0; b < bs_last_barrier; ++b) {
1018 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1019 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1020 #if USE_DEBUGGER
1021 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1022 #endif
1027 #if KMP_AFFINITY_SUPPORTED
1028 // Do not partition the places list for teams construct workers who
1029 // haven't actually been forked to do real work yet. This partitioning
1030 // will take place in the parallel region nested within the teams construct.
1031 if (!fork_teams_workers) {
1032 __kmp_partition_places(team);
1034 #endif
1036 if (team->t.t_nproc > 1 &&
1037 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1038 team->t.b->update_num_threads(team->t.t_nproc);
1039 __kmp_add_threads_to_team(team, team->t.t_nproc);
1043 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1044 for (i = 0; i < team->t.t_nproc; i++) {
1045 kmp_info_t *thr = team->t.t_threads[i];
1046 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1047 thr->th.th_prev_level != team->t.t_level) {
1048 team->t.t_display_affinity = 1;
1049 break;
1054 KMP_MB();
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only touch the FP control registers if values were saved in the team,
    // and then only load the ones that differ from the current hardware state.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
// Other architectures: both operations compile away to nothing.
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1119 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1120 int realloc); // forward declaration
1122 /* Run a parallel region that has been serialized, so runs only in a team of the
1123 single primary thread. */
1124 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1125 kmp_info_t *this_thr;
1126 kmp_team_t *serial_team;
1128 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1130 /* Skip all this code for autopar serialized loops since it results in
1131 unacceptable overhead */
1132 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1133 return;
1135 if (!TCR_4(__kmp_init_parallel))
1136 __kmp_parallel_initialize();
1137 __kmp_resume_if_soft_paused();
1139 this_thr = __kmp_threads[global_tid];
1140 serial_team = this_thr->th.th_serial_team;
1142 /* utilize the serialized team held by this thread */
1143 KMP_DEBUG_ASSERT(serial_team);
1144 KMP_MB();
1146 if (__kmp_tasking_mode != tskm_immediate_exec) {
1147 KMP_DEBUG_ASSERT(
1148 this_thr->th.th_task_team ==
1149 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1150 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1151 NULL);
1152 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1153 "team %p, new task_team = NULL\n",
1154 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1155 this_thr->th.th_task_team = NULL;
1158 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1159 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1160 proc_bind = proc_bind_false;
1161 } else if (proc_bind == proc_bind_default) {
1162 // No proc_bind clause was specified, so use the current value
1163 // of proc-bind-var for this parallel region.
1164 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1166 // Reset for next parallel region
1167 this_thr->th.th_set_proc_bind = proc_bind_default;
1169 // Reset num_threads for next parallel region
1170 this_thr->th.th_set_nproc = 0;
1172 #if OMPT_SUPPORT
1173 ompt_data_t ompt_parallel_data = ompt_data_none;
1174 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1175 if (ompt_enabled.enabled &&
1176 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1178 ompt_task_info_t *parent_task_info;
1179 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1181 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1182 if (ompt_enabled.ompt_callback_parallel_begin) {
1183 int team_size = 1;
1185 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1186 &(parent_task_info->task_data), &(parent_task_info->frame),
1187 &ompt_parallel_data, team_size,
1188 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191 #endif // OMPT_SUPPORT
1193 if (this_thr->th.th_team != serial_team) {
1194 // Nested level will be an index in the nested nthreads array
1195 int level = this_thr->th.th_team->t.t_level;
1197 if (serial_team->t.t_serialized) {
1198 /* this serial team was already used
1199 TODO increase performance by making this locks more specific */
1200 kmp_team_t *new_team;
1202 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1204 new_team =
1205 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1206 #if OMPT_SUPPORT
1207 ompt_parallel_data,
1208 #endif
1209 proc_bind, &this_thr->th.th_current_task->td_icvs,
1210 0 USE_NESTED_HOT_ARG(NULL));
1211 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1212 KMP_ASSERT(new_team);
1214 /* setup new serialized team and install it */
1215 new_team->t.t_threads[0] = this_thr;
1216 new_team->t.t_parent = this_thr->th.th_team;
1217 serial_team = new_team;
1218 this_thr->th.th_serial_team = serial_team;
1220 KF_TRACE(
1222 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1223 global_tid, serial_team));
1225 /* TODO the above breaks the requirement that if we run out of resources,
1226 then we can still guarantee that serialized teams are ok, since we may
1227 need to allocate a new one */
1228 } else {
1229 KF_TRACE(
1231 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1232 global_tid, serial_team));
1235 /* we have to initialize this serial team */
1236 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1237 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1238 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1239 serial_team->t.t_ident = loc;
1240 serial_team->t.t_serialized = 1;
1241 serial_team->t.t_nproc = 1;
1242 serial_team->t.t_parent = this_thr->th.th_team;
1243 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1244 this_thr->th.th_team = serial_team;
1245 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1247 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1248 this_thr->th.th_current_task));
1249 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1250 this_thr->th.th_current_task->td_flags.executing = 0;
1252 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1254 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1255 implicit task for each serialized task represented by
1256 team->t.t_serialized? */
1257 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1258 &this_thr->th.th_current_task->td_parent->td_icvs);
1260 // Thread value exists in the nested nthreads array for the next nested
1261 // level
1262 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1263 this_thr->th.th_current_task->td_icvs.nproc =
1264 __kmp_nested_nth.nth[level + 1];
1267 if (__kmp_nested_proc_bind.used &&
1268 (level + 1 < __kmp_nested_proc_bind.used)) {
1269 this_thr->th.th_current_task->td_icvs.proc_bind =
1270 __kmp_nested_proc_bind.bind_types[level + 1];
1273 #if USE_DEBUGGER
1274 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1275 #endif
1276 this_thr->th.th_info.ds.ds_tid = 0;
1278 /* set thread cache values */
1279 this_thr->th.th_team_nproc = 1;
1280 this_thr->th.th_team_master = this_thr;
1281 this_thr->th.th_team_serialized = 1;
1283 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1284 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1285 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1287 propagateFPControl(serial_team);
1289 /* check if we need to allocate dispatch buffers stack */
1290 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1291 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1292 serial_team->t.t_dispatch->th_disp_buffer =
1293 (dispatch_private_info_t *)__kmp_allocate(
1294 sizeof(dispatch_private_info_t));
1296 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1298 KMP_MB();
1300 } else {
1301 /* this serialized team is already being used,
1302 * that's fine, just add another nested level */
1303 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1304 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1305 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1306 ++serial_team->t.t_serialized;
1307 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1309 // Nested level will be an index in the nested nthreads array
1310 int level = this_thr->th.th_team->t.t_level;
1311 // Thread value exists in the nested nthreads array for the next nested
1312 // level
1313 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1314 this_thr->th.th_current_task->td_icvs.nproc =
1315 __kmp_nested_nth.nth[level + 1];
1317 serial_team->t.t_level++;
1318 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1319 "of serial team %p to %d\n",
1320 global_tid, serial_team, serial_team->t.t_level));
1322 /* allocate/push dispatch buffers stack */
1323 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1325 dispatch_private_info_t *disp_buffer =
1326 (dispatch_private_info_t *)__kmp_allocate(
1327 sizeof(dispatch_private_info_t));
1328 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1329 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1331 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1333 KMP_MB();
1335 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1337 // Perform the display affinity functionality for
1338 // serialized parallel regions
1339 if (__kmp_display_affinity) {
1340 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1341 this_thr->th.th_prev_num_threads != 1) {
1342 // NULL means use the affinity-format-var ICV
1343 __kmp_aux_display_affinity(global_tid, NULL);
1344 this_thr->th.th_prev_level = serial_team->t.t_level;
1345 this_thr->th.th_prev_num_threads = 1;
1349 if (__kmp_env_consistency_check)
1350 __kmp_push_parallel(global_tid, NULL);
1351 #if OMPT_SUPPORT
1352 serial_team->t.ompt_team_info.master_return_address = codeptr;
1353 if (ompt_enabled.enabled &&
1354 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1355 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1356 OMPT_GET_FRAME_ADDRESS(0);
1358 ompt_lw_taskteam_t lw_taskteam;
1359 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1360 &ompt_parallel_data, codeptr);
1362 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1363 // don't use lw_taskteam after linking. content was swaped
1365 /* OMPT implicit task begin */
1366 if (ompt_enabled.ompt_callback_implicit_task) {
1367 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1368 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1369 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1370 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1371 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1372 __kmp_tid_from_gtid(global_tid);
1375 /* OMPT state */
1376 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1377 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1378 OMPT_GET_FRAME_ADDRESS(0);
1380 #endif
1383 // Test if this fork is for a team closely nested in a teams construct
1384 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1385 microtask_t microtask, int level,
1386 int teams_level, kmp_va_list ap) {
1387 return (master_th->th.th_teams_microtask && ap &&
1388 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391 // Test if this fork is for the teams construct, i.e. to form the outer league
1392 // of teams
1393 static inline bool __kmp_is_entering_teams(int active_level, int level,
1394 int teams_level, kmp_va_list ap) {
1395 return ((ap == NULL && active_level == 0) ||
1396 (ap && teams_level > 0 && teams_level == level));
1399 // AC: This is start of parallel that is nested inside teams construct.
1400 // The team is actual (hot), all workers are ready at the fork barrier.
1401 // No lock needed to initialize the team a bit, then free workers.
1402 static inline int
1403 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1404 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1405 enum fork_context_e call_context, microtask_t microtask,
1406 launch_t invoker, int master_set_numthreads, int level,
1407 #if OMPT_SUPPORT
1408 ompt_data_t ompt_parallel_data, void *return_address,
1409 #endif
1410 kmp_va_list ap) {
1411 void **argv;
1412 int i;
1414 parent_team->t.t_ident = loc;
1415 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1416 parent_team->t.t_argc = argc;
1417 argv = (void **)parent_team->t.t_argv;
1418 for (i = argc - 1; i >= 0; --i) {
1419 *argv++ = va_arg(kmp_va_deref(ap), void *);
1421 // Increment our nested depth levels, but not increase the serialization
1422 if (parent_team == master_th->th.th_serial_team) {
1423 // AC: we are in serialized parallel
1424 __kmpc_serialized_parallel(loc, gtid);
1425 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1427 if (call_context == fork_context_gnu) {
1428 // AC: need to decrement t_serialized for enquiry functions to work
1429 // correctly, will restore at join time
1430 parent_team->t.t_serialized--;
1431 return TRUE;
1434 #if OMPD_SUPPORT
1435 parent_team->t.t_pkfn = microtask;
1436 #endif
1438 #if OMPT_SUPPORT
1439 void *dummy;
1440 void **exit_frame_p;
1441 ompt_data_t *implicit_task_data;
1442 ompt_lw_taskteam_t lw_taskteam;
1444 if (ompt_enabled.enabled) {
1445 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1446 &ompt_parallel_data, return_address);
1447 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1449 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1450 // Don't use lw_taskteam after linking. Content was swapped.
1452 /* OMPT implicit task begin */
1453 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1454 if (ompt_enabled.ompt_callback_implicit_task) {
1455 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1456 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1457 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1458 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461 /* OMPT state */
1462 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1463 } else {
1464 exit_frame_p = &dummy;
1466 #endif
1468 // AC: need to decrement t_serialized for enquiry functions to work
1469 // correctly, will restore at join time
1470 parent_team->t.t_serialized--;
1473 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1474 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1475 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1476 #if OMPT_SUPPORT
1478 exit_frame_p
1479 #endif
1483 #if OMPT_SUPPORT
1484 if (ompt_enabled.enabled) {
1485 *exit_frame_p = NULL;
1486 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1487 if (ompt_enabled.ompt_callback_implicit_task) {
1488 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1489 ompt_scope_end, NULL, implicit_task_data, 1,
1490 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1492 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1493 __ompt_lw_taskteam_unlink(master_th);
1494 if (ompt_enabled.ompt_callback_parallel_end) {
1495 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1496 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1497 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1499 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1501 #endif
1502 return TRUE;
1505 parent_team->t.t_pkfn = microtask;
1506 parent_team->t.t_invoke = invoker;
1507 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1508 parent_team->t.t_active_level++;
1509 parent_team->t.t_level++;
1510 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1512 // If the threads allocated to the team are less than the thread limit, update
1513 // the thread limit here. th_teams_size.nth is specific to this team nested
1514 // in a teams construct, the team is fully created, and we're about to do
1515 // the actual fork. Best to do this here so that the subsequent uses below
1516 // and in the join have the correct value.
1517 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1519 #if OMPT_SUPPORT
1520 if (ompt_enabled.enabled) {
1521 ompt_lw_taskteam_t lw_taskteam;
1522 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1523 return_address);
1524 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1526 #endif
1528 /* Change number of threads in the team if requested */
1529 if (master_set_numthreads) { // The parallel has num_threads clause
1530 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1531 // AC: only can reduce number of threads dynamically, can't increase
1532 kmp_info_t **other_threads = parent_team->t.t_threads;
1533 // NOTE: if using distributed barrier, we need to run this code block
1534 // even when the team size appears not to have changed from the max.
1535 int old_proc = master_th->th.th_teams_size.nth;
1536 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1537 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1538 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1540 parent_team->t.t_nproc = master_set_numthreads;
1541 for (i = 0; i < master_set_numthreads; ++i) {
1542 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545 // Keep extra threads hot in the team for possible next parallels
1546 master_th->th.th_set_nproc = 0;
1549 #if USE_DEBUGGER
1550 if (__kmp_debugging) { // Let debugger override number of threads.
1551 int nth = __kmp_omp_num_threads(loc);
1552 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1553 master_set_numthreads = nth;
1556 #endif
1558 // Figure out the proc_bind policy for the nested parallel within teams
1559 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1560 // proc_bind_default means don't update
1561 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1562 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1563 proc_bind = proc_bind_false;
1564 } else {
1565 // No proc_bind clause specified; use current proc-bind-var
1566 if (proc_bind == proc_bind_default) {
1567 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1569 /* else: The proc_bind policy was specified explicitly on parallel clause.
1570 This overrides proc-bind-var for this parallel region, but does not
1571 change proc-bind-var. */
1572 // Figure the value of proc-bind-var for the child threads.
1573 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1574 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1575 master_th->th.th_current_task->td_icvs.proc_bind)) {
1576 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1580 // Need to change the bind-var ICV to correct value for each implicit task
1581 if (proc_bind_icv != proc_bind_default &&
1582 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1583 kmp_info_t **other_threads = parent_team->t.t_threads;
1584 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1585 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588 // Reset for next parallel region
1589 master_th->th.th_set_proc_bind = proc_bind_default;
1591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1592 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1593 KMP_ITT_DEBUG) &&
1594 __kmp_forkjoin_frames_mode == 3 &&
1595 parent_team->t.t_active_level == 1 // only report frames at level 1
1596 && master_th->th.th_teams_size.nteams == 1) {
1597 kmp_uint64 tmp_time = __itt_get_timestamp();
1598 master_th->th.th_frame_time = tmp_time;
1599 parent_team->t.t_region_time = tmp_time;
1601 if (__itt_stack_caller_create_ptr) {
1602 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1603 // create new stack stitching id before entering fork barrier
1604 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1606 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1607 #if KMP_AFFINITY_SUPPORTED
1608 __kmp_partition_places(parent_team);
1609 #endif
1611 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1612 "master_th=%p, gtid=%d\n",
1613 root, parent_team, master_th, gtid));
1614 __kmp_internal_fork(loc, gtid, parent_team);
1615 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1616 "master_th=%p, gtid=%d\n",
1617 root, parent_team, master_th, gtid));
1619 if (call_context == fork_context_gnu)
1620 return TRUE;
1622 /* Invoke microtask for PRIMARY thread */
1623 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1624 parent_team->t.t_id, parent_team->t.t_pkfn));
1626 if (!parent_team->t.t_invoke(gtid)) {
1627 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1629 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1630 parent_team->t.t_id, parent_team->t.t_pkfn));
1631 KMP_MB(); /* Flush all pending memory write invalidates. */
1633 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1635 return TRUE;
1638 // Create a serialized parallel region
1639 static inline int
1640 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1641 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1642 kmp_info_t *master_th, kmp_team_t *parent_team,
1643 #if OMPT_SUPPORT
1644 ompt_data_t *ompt_parallel_data, void **return_address,
1645 ompt_data_t **parent_task_data,
1646 #endif
1647 kmp_va_list ap) {
1648 kmp_team_t *team;
1649 int i;
1650 void **argv;
1652 /* josh todo: hypothetical question: what do we do for OS X*? */
1653 #if KMP_OS_LINUX && \
1654 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1655 void *args[argc];
1656 #else
1657 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1658 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1659 KMP_ARCH_AARCH64) */
1661 KA_TRACE(
1662 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1664 __kmpc_serialized_parallel(loc, gtid);
1666 #if OMPD_SUPPORT
1667 master_th->th.th_serial_team->t.t_pkfn = microtask;
1668 #endif
1670 if (call_context == fork_context_intel) {
1671 /* TODO this sucks, use the compiler itself to pass args! :) */
1672 master_th->th.th_serial_team->t.t_ident = loc;
1673 if (!ap) {
1674 // revert change made in __kmpc_serialized_parallel()
1675 master_th->th.th_serial_team->t.t_level--;
1676 // Get args from parent team for teams construct
1678 #if OMPT_SUPPORT
1679 void *dummy;
1680 void **exit_frame_p;
1681 ompt_task_info_t *task_info;
1682 ompt_lw_taskteam_t lw_taskteam;
1684 if (ompt_enabled.enabled) {
1685 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1686 ompt_parallel_data, *return_address);
1688 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1689 // don't use lw_taskteam after linking. content was swaped
1690 task_info = OMPT_CUR_TASK_INFO(master_th);
1691 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1692 if (ompt_enabled.ompt_callback_implicit_task) {
1693 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1694 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1695 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1696 &(task_info->task_data), 1,
1697 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700 /* OMPT state */
1701 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1702 } else {
1703 exit_frame_p = &dummy;
1705 #endif
1708 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1709 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1710 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1711 #if OMPT_SUPPORT
1713 exit_frame_p
1714 #endif
1718 #if OMPT_SUPPORT
1719 if (ompt_enabled.enabled) {
1720 *exit_frame_p = NULL;
1721 if (ompt_enabled.ompt_callback_implicit_task) {
1722 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1723 ompt_scope_end, NULL, &(task_info->task_data), 1,
1724 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1726 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1727 __ompt_lw_taskteam_unlink(master_th);
1728 if (ompt_enabled.ompt_callback_parallel_end) {
1729 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1730 ompt_parallel_data, *parent_task_data,
1731 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1733 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1735 #endif
1736 } else if (microtask == (microtask_t)__kmp_teams_master) {
1737 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1738 team = master_th->th.th_team;
1739 // team->t.t_pkfn = microtask;
1740 team->t.t_invoke = invoker;
1741 __kmp_alloc_argv_entries(argc, team, TRUE);
1742 team->t.t_argc = argc;
1743 argv = (void **)team->t.t_argv;
1744 if (ap) {
1745 for (i = argc - 1; i >= 0; --i)
1746 *argv++ = va_arg(kmp_va_deref(ap), void *);
1747 } else {
1748 for (i = 0; i < argc; ++i)
1749 // Get args from parent team for teams construct
1750 argv[i] = parent_team->t.t_argv[i];
1752 // AC: revert change made in __kmpc_serialized_parallel()
1753 // because initial code in teams should have level=0
1754 team->t.t_level--;
1755 // AC: call special invoker for outer "parallel" of teams construct
1756 invoker(gtid);
1757 #if OMPT_SUPPORT
1758 if (ompt_enabled.enabled) {
1759 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1760 if (ompt_enabled.ompt_callback_implicit_task) {
1761 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1762 ompt_scope_end, NULL, &(task_info->task_data), 0,
1763 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1765 if (ompt_enabled.ompt_callback_parallel_end) {
1766 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1767 ompt_parallel_data, *parent_task_data,
1768 OMPT_INVOKER(call_context) | ompt_parallel_league,
1769 *return_address);
1771 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1773 #endif
1774 } else {
1775 argv = args;
1776 for (i = argc - 1; i >= 0; --i)
1777 *argv++ = va_arg(kmp_va_deref(ap), void *);
1778 KMP_MB();
1780 #if OMPT_SUPPORT
1781 void *dummy;
1782 void **exit_frame_p;
1783 ompt_task_info_t *task_info;
1784 ompt_lw_taskteam_t lw_taskteam;
1785 ompt_data_t *implicit_task_data;
1787 if (ompt_enabled.enabled) {
1788 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1789 ompt_parallel_data, *return_address);
1790 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1791 // don't use lw_taskteam after linking. content was swaped
1792 task_info = OMPT_CUR_TASK_INFO(master_th);
1793 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1795 /* OMPT implicit task begin */
1796 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1797 if (ompt_enabled.ompt_callback_implicit_task) {
1798 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1799 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1800 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1801 ompt_task_implicit);
1802 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805 /* OMPT state */
1806 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1807 } else {
1808 exit_frame_p = &dummy;
1810 #endif
1813 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1814 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1815 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1816 #if OMPT_SUPPORT
1818 exit_frame_p
1819 #endif
1823 #if OMPT_SUPPORT
1824 if (ompt_enabled.enabled) {
1825 *exit_frame_p = NULL;
1826 if (ompt_enabled.ompt_callback_implicit_task) {
1827 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828 ompt_scope_end, NULL, &(task_info->task_data), 1,
1829 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1833 __ompt_lw_taskteam_unlink(master_th);
1834 if (ompt_enabled.ompt_callback_parallel_end) {
1835 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1836 ompt_parallel_data, *parent_task_data,
1837 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1839 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1841 #endif
1843 } else if (call_context == fork_context_gnu) {
1844 #if OMPT_SUPPORT
1845 if (ompt_enabled.enabled) {
1846 ompt_lw_taskteam_t lwt;
1847 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1848 *return_address);
1850 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1851 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1853 // don't use lw_taskteam after linking. content was swapped
1854 #endif
1856 // we were called from GNU native code
1857 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1858 return FALSE;
1859 } else {
1860 KMP_ASSERT2(call_context < fork_context_last,
1861 "__kmp_serial_fork_call: unknown fork_context parameter");
1864 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1865 KMP_MB();
1866 return FALSE;
1869 /* most of the work for a fork */
1870 /* return true if we really went parallel, false if serialized */
// __kmp_fork_call: starts a parallel region (or a teams construct) on behalf
// of the thread identified by gtid.
//
// Parameters, as they are used in the body below:
//   loc          - source location; stored in master_th->th.th_ident and in
//                  team->t.t_ident.
//   gtid         - global thread id; indexes __kmp_threads[] and __kmp_root[].
//   call_context - which interface made the call (Intel, GNU, ...). For
//                  fork_context_gnu the function returns TRUE right after the
//                  team has been forked, without calling team->t.t_invoke.
//   argc         - number of microtask arguments.
//   microtask    - outlined routine, stored in team->t.t_pkfn.
//   invoker      - stored in team->t.t_invoke and called for this thread.
//   ap           - variadic list holding argc pointer arguments. When it is
//                  NULL the arguments are copied from the parent team's t_argv
//                  and __kmp_internal_fork is skipped (teams construct).
//
// Return value: the result of __kmp_fork_in_teams or __kmp_serial_fork_call
// when one of those paths is taken; otherwise TRUE once a team was forked.
//
// Locking: __kmp_forkjoin_lock is acquired while reserving threads (only when
// more than one thread is requested). It is released immediately if only one
// thread could be reserved, otherwise after the team threads are forked.
1871 int __kmp_fork_call(ident_t *loc, int gtid,
1872 enum fork_context_e call_context, // Intel, GNU, ...
1873 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1874 kmp_va_list ap) {
1875 void **argv;
1876 int i;
1877 int master_tid;
1878 int master_this_cons;
1879 kmp_team_t *team;
1880 kmp_team_t *parent_team;
1881 kmp_info_t *master_th;
1882 kmp_root_t *root;
1883 int nthreads;
1884 int master_active;
1885 int master_set_numthreads;
1886 int task_thread_limit = 0;
1887 int level;
1888 int active_level;
1889 int teams_level;
1890 #if KMP_NESTED_HOT_TEAMS
1891 kmp_hot_team_ptr_t **p_hot_teams;
1892 #endif
1893 { // KMP_TIME_BLOCK
1894 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1895 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1897 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1898 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1899 /* Some systems prefer the stack for the root thread(s) to start with */
1900 /* some gap from the parent stack to prevent false sharing. */
1901 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1902 /* These 2 lines below are so this does not get optimized out */
1903 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1904 __kmp_stkpadding += (short)((kmp_int64)dummy);
1907 /* initialize if needed */
1908 KMP_DEBUG_ASSERT(
1909 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1910 if (!TCR_4(__kmp_init_parallel))
1911 __kmp_parallel_initialize();
1912 __kmp_resume_if_soft_paused();
1914 /* setup current data */
1915 // AC: potentially unsafe, not in sync with library shutdown,
1916 // __kmp_threads can be freed
1917 master_th = __kmp_threads[gtid];
// Snapshot the calling thread's team, tid, construct counter, root, root
// activity flag, requested thread count and task thread limit before any of
// them is modified further down.
1919 parent_team = master_th->th.th_team;
1920 master_tid = master_th->th.th_info.ds.ds_tid;
1921 master_this_cons = master_th->th.th_local.this_construct;
1922 root = master_th->th.th_root;
1923 master_active = root->r.r_active;
1924 master_set_numthreads = master_th->th.th_set_nproc;
1925 task_thread_limit =
1926 master_th->th.th_current_task->td_icvs.task_thread_limit;
1928 #if OMPT_SUPPORT
1929 ompt_data_t ompt_parallel_data = ompt_data_none;
1930 ompt_data_t *parent_task_data;
1931 ompt_frame_t *ompt_frame;
1932 void *return_address = NULL;
1934 if (ompt_enabled.enabled) {
1935 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1936 NULL, NULL);
1937 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1939 #endif
1941 // Assign affinity to root thread if it hasn't happened yet
1942 __kmp_assign_root_init_mask();
1944 // Nested level will be an index in the nested nthreads array
1945 level = parent_team->t.t_level;
1946 // used to launch non-serial teams even if nested is not allowed
1947 active_level = parent_team->t.t_active_level;
1948 // needed to check nesting inside the teams
1949 teams_level = master_th->th.th_teams_level;
1950 #if KMP_NESTED_HOT_TEAMS
1951 p_hot_teams = &master_th->th.th_hot_teams;
1952 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1953 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1954 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1955 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1956 // it is either actual or not needed (when active_level > 0)
1957 (*p_hot_teams)[0].hot_team_nth = 1;
1959 #endif
1961 #if OMPT_SUPPORT
1962 if (ompt_enabled.enabled) {
1963 if (ompt_enabled.ompt_callback_parallel_begin) {
1964 int team_size = master_set_numthreads
1965 ? master_set_numthreads
1966 : get__nproc_2(parent_team, master_tid);
1967 int flags = OMPT_INVOKER(call_context) |
1968 ((microtask == (microtask_t)__kmp_teams_master)
1969 ? ompt_parallel_league
1970 : ompt_parallel_team);
1971 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1972 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1973 return_address);
1975 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1977 #endif
1979 master_th->th.th_ident = loc;
1981 // Parallel closely nested in teams construct:
1982 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1983 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1984 call_context, microtask, invoker,
1985 master_set_numthreads, level,
1986 #if OMPT_SUPPORT
1987 ompt_parallel_data, return_address,
1988 #endif
1989 ap);
1990 } // End parallel closely nested in teams construct
1992 #if KMP_DEBUG
1993 if (__kmp_tasking_mode != tskm_immediate_exec) {
1994 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1995 parent_team->t.t_task_team[master_th->th.th_task_state]);
1997 #endif
1999 // Need this to happen before we determine the number of threads, not while
2000 // we are allocating the team
2001 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2003 // Determine the number of threads
2004 int enter_teams =
2005 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2006 if ((!enter_teams &&
2007 (parent_team->t.t_active_level >=
2008 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2009 (__kmp_library == library_serial)) {
2010 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2011 nthreads = 1;
2012 } else {
2013 nthreads = master_set_numthreads
2014 ? master_set_numthreads
2015 // TODO: get nproc directly from current task
2016 : get__nproc_2(parent_team, master_tid);
2017 // Use the thread_limit set for the current target task if exists, else go
2018 // with the deduced nthreads
2019 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2020 ? task_thread_limit
2021 : nthreads;
2022 // Check if we need to take forkjoin lock? (no need for serialized
2023 // parallel out of teams construct).
2024 if (nthreads > 1) {
2025 /* determine how many new threads we can use */
2026 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2027 /* AC: If we execute teams from parallel region (on host), then teams
2028 should be created but each can only have 1 thread if nesting is
2029 disabled. If teams called from serial region, then teams and their
2030 threads should be created regardless of the nesting setting. */
2031 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2032 nthreads, enter_teams);
2033 if (nthreads == 1) {
2034 // Free lock for single thread execution here; for multi-thread
2035 // execution it will be freed later after team of threads created
2036 // and initialized
2037 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2041 KMP_DEBUG_ASSERT(nthreads > 0);
2043 // If we temporarily changed the set number of threads then restore it now
2044 master_th->th.th_set_nproc = 0;
// Only one thread is available: hand off to the serialized path and return
// its result directly.
2046 if (nthreads == 1) {
2047 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2048 invoker, master_th, parent_team,
2049 #if OMPT_SUPPORT
2050 &ompt_parallel_data, &return_address,
2051 &parent_task_data,
2052 #endif
2053 ap);
2054 } // if (nthreads == 1)
2056 // GEH: only modify the executing flag in the case when not serialized
2057 // serialized case is handled in kmpc_serialized_parallel
2058 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2059 "curtask=%p, curtask_max_aclevel=%d\n",
2060 parent_team->t.t_active_level, master_th,
2061 master_th->th.th_current_task,
2062 master_th->th.th_current_task->td_icvs.max_active_levels));
2063 // TODO: GEH - cannot do this assertion because root thread not set up as
2064 // executing
2065 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2066 master_th->th.th_current_task->td_flags.executing = 0;
2068 if (!master_th->th.th_teams_microtask || level > teams_level) {
2069 /* Increment our nested depth level */
2070 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2073 // See if we need to make a copy of the ICVs.
2074 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2075 if ((level + 1 < __kmp_nested_nth.used) &&
2076 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2077 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2078 } else {
2079 nthreads_icv = 0; // don't update
2082 // Figure out the proc_bind_policy for the new team.
2083 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2084 // proc_bind_default means don't update
2085 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2086 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2087 proc_bind = proc_bind_false;
2088 } else {
2089 // No proc_bind clause specified; use current proc-bind-var for this
2090 // parallel region
2091 if (proc_bind == proc_bind_default) {
2092 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2094 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2095 if (master_th->th.th_teams_microtask &&
2096 microtask == (microtask_t)__kmp_teams_master) {
2097 proc_bind = __kmp_teams_proc_bind;
2099 /* else: The proc_bind policy was specified explicitly on parallel clause.
2100 This overrides proc-bind-var for this parallel region, but does not
2101 change proc-bind-var. */
2102 // Figure the value of proc-bind-var for the child threads.
2103 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2104 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2105 master_th->th.th_current_task->td_icvs.proc_bind)) {
2106 // Do not modify the proc bind icv for the two teams construct forks
2107 // They just let the proc bind icv pass through
2108 if (!master_th->th.th_teams_microtask ||
2109 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2110 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2114 // Reset for next parallel region
2115 master_th->th.th_set_proc_bind = proc_bind_default;
// Allocate the team with a private copy of the ICVs only when nproc and/or
// proc_bind must differ from the current task's ICVs; otherwise the current
// task's ICVs are passed to __kmp_allocate_team directly.
2117 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2118 kmp_internal_control_t new_icvs;
2119 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2120 new_icvs.next = NULL;
2121 if (nthreads_icv > 0) {
2122 new_icvs.nproc = nthreads_icv;
2124 if (proc_bind_icv != proc_bind_default) {
2125 new_icvs.proc_bind = proc_bind_icv;
2128 /* allocate a new parallel team */
2129 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2130 team = __kmp_allocate_team(root, nthreads, nthreads,
2131 #if OMPT_SUPPORT
2132 ompt_parallel_data,
2133 #endif
2134 proc_bind, &new_icvs,
2135 argc USE_NESTED_HOT_ARG(master_th));
2136 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2137 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2138 } else {
2139 /* allocate a new parallel team */
2140 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2141 team = __kmp_allocate_team(root, nthreads, nthreads,
2142 #if OMPT_SUPPORT
2143 ompt_parallel_data,
2144 #endif
2145 proc_bind,
2146 &master_th->th.th_current_task->td_icvs,
2147 argc USE_NESTED_HOT_ARG(master_th));
2148 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2149 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2150 &master_th->th.th_current_task->td_icvs);
2152 KF_TRACE(
2153 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2155 /* setup the new team */
2156 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2157 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2158 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2159 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2160 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2161 #if OMPT_SUPPORT
2162 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2163 return_address);
2164 #endif
2165 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2166 // TODO: parent_team->t.t_level == INT_MAX ???
2167 if (!master_th->th.th_teams_microtask || level > teams_level) {
2168 int new_level = parent_team->t.t_level + 1;
2169 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2170 new_level = parent_team->t.t_active_level + 1;
2171 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2172 } else {
2173 // AC: Do not increase parallel level at start of the teams construct
2174 int new_level = parent_team->t.t_level;
2175 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2176 new_level = parent_team->t.t_active_level;
2177 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2179 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2180 // set primary thread's schedule as new run-time schedule
2181 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2183 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2184 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2186 // Update the floating point rounding in the team if required.
2187 propagateFPControl(team);
2188 #if OMPD_SUPPORT
2189 if (ompd_state & OMPD_ENABLE_BP)
2190 ompd_bp_parallel_begin();
2191 #endif
2193 if (__kmp_tasking_mode != tskm_immediate_exec) {
2194 // Set primary thread's task team to team's task team. Unless this is hot
2195 // team, it should be NULL.
2196 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2197 parent_team->t.t_task_team[master_th->th.th_task_state]);
2198 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2199 "%p, new task_team %p / team %p\n",
2200 __kmp_gtid_from_thread(master_th),
2201 master_th->th.th_task_team, parent_team,
2202 team->t.t_task_team[master_th->th.th_task_state], team));
2204 if (active_level || master_th->th.th_task_team) {
2205 // Take a memo of primary thread's task_state
2206 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
// When the memo stack is full, replace it with one of twice the size: old
// entries are copied over and the new tail is zeroed before the old buffer
// is freed.
2207 if (master_th->th.th_task_state_top >=
2208 master_th->th.th_task_state_stack_sz) { // increase size
2209 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2210 kmp_uint8 *old_stack, *new_stack;
2211 kmp_uint32 i;
2212 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2213 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2214 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2216 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2217 ++i) { // zero-init rest of stack
2218 new_stack[i] = 0;
2220 old_stack = master_th->th.th_task_state_memo_stack;
2221 master_th->th.th_task_state_memo_stack = new_stack;
2222 master_th->th.th_task_state_stack_sz = new_size;
2223 __kmp_free(old_stack);
2225 // Store primary thread's task_state on stack
2226 master_th->th
2227 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2228 master_th->th.th_task_state;
2229 master_th->th.th_task_state_top++;
2230 #if KMP_NESTED_HOT_TEAMS
2231 if (master_th->th.th_hot_teams &&
2232 active_level < __kmp_hot_teams_max_level &&
2233 team == master_th->th.th_hot_teams[active_level].hot_team) {
2234 // Restore primary thread's nested state if nested hot team
2235 master_th->th.th_task_state =
2236 master_th->th
2237 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2238 } else {
2239 #endif
2240 master_th->th.th_task_state = 0;
2241 #if KMP_NESTED_HOT_TEAMS
2243 #endif
2245 #if !KMP_NESTED_HOT_TEAMS
2246 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2247 (team == root->r.r_hot_team));
2248 #endif
2251 KA_TRACE(
2253 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2254 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2255 team->t.t_nproc));
2256 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2257 (team->t.t_master_tid == 0 &&
2258 (team->t.t_parent == root->r.r_root_team ||
2259 team->t.t_parent->t.t_serialized)));
2260 KMP_MB();
2262 /* now, setup the arguments */
2263 argv = (void **)team->t.t_argv;
2264 if (ap) {
2265 for (i = argc - 1; i >= 0; --i) {
2266 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2267 KMP_CHECK_UPDATE(*argv, new_argv);
2268 argv++;
2270 } else {
2271 for (i = 0; i < argc; ++i) {
2272 // Get args from parent team for teams construct
2273 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2277 /* now actually fork the threads */
2278 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2279 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2280 root->r.r_active = TRUE;
2282 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2283 __kmp_setup_icv_copy(team, nthreads,
2284 &master_th->th.th_current_task->td_icvs, loc);
2286 #if OMPT_SUPPORT
2287 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2288 #endif
// Release the fork/join lock that was acquired above while reserving threads
// (the nthreads > 1 path).
2290 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2292 #if USE_ITT_BUILD
2293 if (team->t.t_active_level == 1 // only report frames at level 1
2294 && !master_th->th.th_teams_microtask) { // not in teams construct
2295 #if USE_ITT_NOTIFY
2296 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2297 (__kmp_forkjoin_frames_mode == 3 ||
2298 __kmp_forkjoin_frames_mode == 1)) {
2299 kmp_uint64 tmp_time = 0;
2300 if (__itt_get_timestamp_ptr)
2301 tmp_time = __itt_get_timestamp();
2302 // Internal fork - report frame begin
2303 master_th->th.th_frame_time = tmp_time;
2304 if (__kmp_forkjoin_frames_mode == 3)
2305 team->t.t_region_time = tmp_time;
2306 } else
2307 // only one notification scheme (either "submit" or "forking/joined", not both)
2308 #endif /* USE_ITT_NOTIFY */
2309 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2310 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2311 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2312 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2315 #endif /* USE_ITT_BUILD */
2317 /* now go on and do the work */
2318 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2319 KMP_MB();
2320 KF_TRACE(10,
2321 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2322 root, team, master_th, gtid));
2324 #if USE_ITT_BUILD
2325 if (__itt_stack_caller_create_ptr) {
2326 // create new stack stitching id before entering fork barrier
2327 if (!enter_teams) {
2328 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2329 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2330 } else if (parent_team->t.t_serialized) {
2331 // keep stack stitching id in the serialized parent_team;
2332 // current team will be used for parallel inside the teams;
2333 // if parent_team is active, then it already keeps stack stitching id
2334 // for the league of teams
2335 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2336 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2339 #endif /* USE_ITT_BUILD */
2341 // AC: skip __kmp_internal_fork at teams construct, let only primary
2342 // threads execute
2343 if (ap) {
2344 __kmp_internal_fork(loc, gtid, team);
2345 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2346 "master_th=%p, gtid=%d\n",
2347 root, team, master_th, gtid));
// For the GNU interface return now; team->t.t_invoke is not called by this
// function on this path.
2350 if (call_context == fork_context_gnu) {
2351 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352 return TRUE;
2355 /* Invoke microtask for PRIMARY thread */
2356 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2357 team->t.t_id, team->t.t_pkfn));
2358 } // END of timer KMP_fork_call block
2360 #if KMP_STATS_ENABLED
2361 // If beginning a teams construct, then change thread state
2362 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2363 if (!ap) {
2364 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2366 #endif
// Run the team's invoker on this thread; a zero result trips the assertion
// below.
2368 if (!team->t.t_invoke(gtid)) {
2369 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2372 #if KMP_STATS_ENABLED
2373 // If was beginning of a teams construct, then reset thread state
2374 if (!ap) {
2375 KMP_SET_THREAD_STATE(previous_state);
2377 #endif
2379 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2380 team->t.t_id, team->t.t_pkfn));
2381 KMP_MB(); /* Flush all pending memory write invalidates. */
2383 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2384 #if OMPT_SUPPORT
2385 if (ompt_enabled.enabled) {
2386 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2388 #endif
2390 return TRUE;
2393 #if OMPT_SUPPORT
2394 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2395 kmp_team_t *team) {
2396 // restore state outside the region
2397 thread->th.ompt_thread_info.state =
2398 ((team->t.t_serialized) ? ompt_state_work_serial
2399 : ompt_state_work_parallel);
2402 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2403 kmp_team_t *team, ompt_data_t *parallel_data,
2404 int flags, void *codeptr) {
2405 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2406 if (ompt_enabled.ompt_callback_parallel_end) {
2407 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2408 parallel_data, &(task_info->task_data), flags, codeptr);
2411 task_info->frame.enter_frame = ompt_data_none;
2412 __kmp_join_restore_state(thread, team);
2414 #endif
2416 void __kmp_join_call(ident_t *loc, int gtid
2417 #if OMPT_SUPPORT
2419 enum fork_context_e fork_context
2420 #endif
2422 int exit_teams) {
2423 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2424 kmp_team_t *team;
2425 kmp_team_t *parent_team;
2426 kmp_info_t *master_th;
2427 kmp_root_t *root;
2428 int master_active;
2430 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2432 /* setup current data */
2433 master_th = __kmp_threads[gtid];
2434 root = master_th->th.th_root;
2435 team = master_th->th.th_team;
2436 parent_team = team->t.t_parent;
2438 master_th->th.th_ident = loc;
2440 #if OMPT_SUPPORT
2441 void *team_microtask = (void *)team->t.t_pkfn;
2442 // For GOMP interface with serialized parallel, need the
2443 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2444 // and end-parallel events.
2445 if (ompt_enabled.enabled &&
2446 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2447 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2449 #endif
2451 #if KMP_DEBUG
2452 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2453 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2454 "th_task_team = %p\n",
2455 __kmp_gtid_from_thread(master_th), team,
2456 team->t.t_task_team[master_th->th.th_task_state],
2457 master_th->th.th_task_team));
2458 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2459 team->t.t_task_team[master_th->th.th_task_state]);
2461 #endif
2463 if (team->t.t_serialized) {
2464 if (master_th->th.th_teams_microtask) {
2465 // We are in teams construct
2466 int level = team->t.t_level;
2467 int tlevel = master_th->th.th_teams_level;
2468 if (level == tlevel) {
2469 // AC: we haven't incremented it earlier at start of teams construct,
2470 // so do it here - at the end of teams construct
2471 team->t.t_level++;
2472 } else if (level == tlevel + 1) {
2473 // AC: we are exiting parallel inside teams, need to increment
2474 // serialization in order to restore it in the next call to
2475 // __kmpc_end_serialized_parallel
2476 team->t.t_serialized++;
2479 __kmpc_end_serialized_parallel(loc, gtid);
2481 #if OMPT_SUPPORT
2482 if (ompt_enabled.enabled) {
2483 if (fork_context == fork_context_gnu) {
2484 __ompt_lw_taskteam_unlink(master_th);
2486 __kmp_join_restore_state(master_th, parent_team);
2488 #endif
2490 return;
2493 master_active = team->t.t_master_active;
2495 if (!exit_teams) {
2496 // AC: No barrier for internal teams at exit from teams construct.
2497 // But there is barrier for external team (league).
2498 __kmp_internal_join(loc, gtid, team);
2499 #if USE_ITT_BUILD
2500 if (__itt_stack_caller_create_ptr) {
2501 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2502 // destroy the stack stitching id after join barrier
2503 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2504 team->t.t_stack_id = NULL;
2506 #endif
2507 } else {
2508 master_th->th.th_task_state =
2509 0; // AC: no tasking in teams (out of any parallel)
2510 #if USE_ITT_BUILD
2511 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2512 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2513 // destroy the stack stitching id on exit from the teams construct
2514 // if parent_team is active, then the id will be destroyed later on
2515 // by master of the league of teams
2516 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2517 parent_team->t.t_stack_id = NULL;
2519 #endif
2522 KMP_MB();
2524 #if OMPT_SUPPORT
2525 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2526 void *codeptr = team->t.ompt_team_info.master_return_address;
2527 #endif
2529 #if USE_ITT_BUILD
2530 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2531 if (team->t.t_active_level == 1 &&
2532 (!master_th->th.th_teams_microtask || /* not in teams construct */
2533 master_th->th.th_teams_size.nteams == 1)) {
2534 master_th->th.th_ident = loc;
2535 // only one notification scheme (either "submit" or "forking/joined", not
2536 // both)
2537 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2538 __kmp_forkjoin_frames_mode == 3)
2539 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2540 master_th->th.th_frame_time, 0, loc,
2541 master_th->th.th_team_nproc, 1);
2542 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2543 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2544 __kmp_itt_region_joined(gtid);
2545 } // active_level == 1
2546 #endif /* USE_ITT_BUILD */
2548 #if KMP_AFFINITY_SUPPORTED
2549 if (!exit_teams) {
2550 // Restore master thread's partition.
2551 master_th->th.th_first_place = team->t.t_first_place;
2552 master_th->th.th_last_place = team->t.t_last_place;
2554 #endif // KMP_AFFINITY_SUPPORTED
2556 if (master_th->th.th_teams_microtask && !exit_teams &&
2557 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2558 team->t.t_level == master_th->th.th_teams_level + 1) {
2559 // AC: We need to leave the team structure intact at the end of parallel
2560 // inside the teams construct, so that at the next parallel same (hot) team
2561 // works, only adjust nesting levels
2562 #if OMPT_SUPPORT
2563 ompt_data_t ompt_parallel_data = ompt_data_none;
2564 if (ompt_enabled.enabled) {
2565 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2566 if (ompt_enabled.ompt_callback_implicit_task) {
2567 int ompt_team_size = team->t.t_nproc;
2568 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2569 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2570 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2572 task_info->frame.exit_frame = ompt_data_none;
2573 task_info->task_data = ompt_data_none;
2574 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2575 __ompt_lw_taskteam_unlink(master_th);
2577 #endif
2578 /* Decrement our nested depth level */
2579 team->t.t_level--;
2580 team->t.t_active_level--;
2581 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2583 // Restore number of threads in the team if needed. This code relies on
2584 // the proper adjustment of th_teams_size.nth after the fork in
2585 // __kmp_teams_master on each teams primary thread in the case that
2586 // __kmp_reserve_threads reduced it.
2587 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2588 int old_num = master_th->th.th_team_nproc;
2589 int new_num = master_th->th.th_teams_size.nth;
2590 kmp_info_t **other_threads = team->t.t_threads;
2591 team->t.t_nproc = new_num;
2592 for (int i = 0; i < old_num; ++i) {
2593 other_threads[i]->th.th_team_nproc = new_num;
2595 // Adjust states of non-used threads of the team
2596 for (int i = old_num; i < new_num; ++i) {
2597 // Re-initialize thread's barrier data.
2598 KMP_DEBUG_ASSERT(other_threads[i]);
2599 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2600 for (int b = 0; b < bs_last_barrier; ++b) {
2601 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2602 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2603 #if USE_DEBUGGER
2604 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2605 #endif
2607 if (__kmp_tasking_mode != tskm_immediate_exec) {
2608 // Synchronize thread's task state
2609 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2614 #if OMPT_SUPPORT
2615 if (ompt_enabled.enabled) {
2616 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2617 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2619 #endif
2621 return;
2624 /* do cleanup and restore the parent team */
2625 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2626 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2628 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2630 /* jc: The following lock has instructions with REL and ACQ semantics,
2631 separating the parallel user code called in this parallel region
2632 from the serial user code called after this function returns. */
2633 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2635 if (!master_th->th.th_teams_microtask ||
2636 team->t.t_level > master_th->th.th_teams_level) {
2637 /* Decrement our nested depth level */
2638 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2640 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2642 #if OMPT_SUPPORT
2643 if (ompt_enabled.enabled) {
2644 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2645 if (ompt_enabled.ompt_callback_implicit_task) {
2646 int flags = (team_microtask == (void *)__kmp_teams_master)
2647 ? ompt_task_initial
2648 : ompt_task_implicit;
2649 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2650 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2651 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2652 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2654 task_info->frame.exit_frame = ompt_data_none;
2655 task_info->task_data = ompt_data_none;
2657 #endif
2659 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2660 master_th, team));
2661 __kmp_pop_current_task_from_thread(master_th);
2663 master_th->th.th_def_allocator = team->t.t_def_allocator;
2665 #if OMPD_SUPPORT
2666 if (ompd_state & OMPD_ENABLE_BP)
2667 ompd_bp_parallel_end();
2668 #endif
2669 updateHWFPControl(team);
2671 if (root->r.r_active != master_active)
2672 root->r.r_active = master_active;
2674 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2675 master_th)); // this will free worker threads
2677 /* this race was fun to find. make sure the following is in the critical
2678 region otherwise assertions may fail occasionally since the old team may be
2679 reallocated and the hierarchy appears inconsistent. it is actually safe to
2680 run and won't cause any bugs, but will cause those assertion failures. it's
2681 only one deref&assign so might as well put this in the critical region */
2682 master_th->th.th_team = parent_team;
2683 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2684 master_th->th.th_team_master = parent_team->t.t_threads[0];
2685 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2687 /* restore serialized team, if need be */
2688 if (parent_team->t.t_serialized &&
2689 parent_team != master_th->th.th_serial_team &&
2690 parent_team != root->r.r_root_team) {
2691 __kmp_free_team(root,
2692 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2693 master_th->th.th_serial_team = parent_team;
2696 if (__kmp_tasking_mode != tskm_immediate_exec) {
2697 if (master_th->th.th_task_state_top >
2698 0) { // Restore task state from memo stack
2699 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2700 // Remember primary thread's state if we re-use this nested hot team
2701 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2702 master_th->th.th_task_state;
2703 --master_th->th.th_task_state_top; // pop
2704 // Now restore state at this level
2705 master_th->th.th_task_state =
2706 master_th->th
2707 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2708 } else if (team != root->r.r_hot_team) {
2709 // Reset the task state of primary thread if we are not hot team because
2710 // in this case all the worker threads will be free, and their task state
2711 // will be reset. If not reset the primary's, the task state will be
2712 // inconsistent.
2713 master_th->th.th_task_state = 0;
2715 // Copy the task team from the parent team to the primary thread
2716 master_th->th.th_task_team =
2717 parent_team->t.t_task_team[master_th->th.th_task_state];
2718 KA_TRACE(20,
2719 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2720 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2721 parent_team));
2724 // TODO: GEH - cannot do this assertion because root thread not set up as
2725 // executing
2726 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2727 master_th->th.th_current_task->td_flags.executing = 1;
2729 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2731 #if KMP_AFFINITY_SUPPORTED
2732 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2733 __kmp_reset_root_init_mask(gtid);
2735 #endif
2736 #if OMPT_SUPPORT
2737 int flags =
2738 OMPT_INVOKER(fork_context) |
2739 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2740 : ompt_parallel_team);
2741 if (ompt_enabled.enabled) {
2742 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2743 codeptr);
2745 #endif
2747 KMP_MB();
2748 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751 /* Check whether we should push an internal control record onto the
2752 serial team stack. If so, do it. */
2753 void __kmp_save_internal_controls(kmp_info_t *thread) {
2755 if (thread->th.th_team != thread->th.th_serial_team) {
2756 return;
2758 if (thread->th.th_team->t.t_serialized > 1) {
2759 int push = 0;
2761 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2762 push = 1;
2763 } else {
2764 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2765 thread->th.th_team->t.t_serialized) {
2766 push = 1;
2769 if (push) { /* push a record on the serial team's stack */
2770 kmp_internal_control_t *control =
2771 (kmp_internal_control_t *)__kmp_allocate(
2772 sizeof(kmp_internal_control_t));
2774 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2776 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2778 control->next = thread->th.th_team->t.t_control_stack_top;
2779 thread->th.th_team->t.t_control_stack_top = control;
2784 /* Changes set_nproc */
2785 void __kmp_set_num_threads(int new_nth, int gtid) {
2786 kmp_info_t *thread;
2787 kmp_root_t *root;
2789 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2790 KMP_DEBUG_ASSERT(__kmp_init_serial);
2792 if (new_nth < 1)
2793 new_nth = 1;
2794 else if (new_nth > __kmp_max_nth)
2795 new_nth = __kmp_max_nth;
2797 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2798 thread = __kmp_threads[gtid];
2799 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2800 return; // nothing to do
2802 __kmp_save_internal_controls(thread);
2804 set__nproc(thread, new_nth);
2806 // If this omp_set_num_threads() call will cause the hot team size to be
2807 // reduced (in the absence of a num_threads clause), then reduce it now,
2808 // rather than waiting for the next parallel region.
2809 root = thread->th.th_root;
2810 if (__kmp_init_parallel && (!root->r.r_active) &&
2811 (root->r.r_hot_team->t.t_nproc > new_nth)
2812 #if KMP_NESTED_HOT_TEAMS
2813 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2814 #endif
2816 kmp_team_t *hot_team = root->r.r_hot_team;
2817 int f;
2819 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2821 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2822 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2824 // Release the extra threads we don't need any more.
2825 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2826 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2827 if (__kmp_tasking_mode != tskm_immediate_exec) {
2828 // When decreasing team size, threads no longer in the team should unref
2829 // task team.
2830 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2832 __kmp_free_thread(hot_team->t.t_threads[f]);
2833 hot_team->t.t_threads[f] = NULL;
2835 hot_team->t.t_nproc = new_nth;
2836 #if KMP_NESTED_HOT_TEAMS
2837 if (thread->th.th_hot_teams) {
2838 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2839 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2841 #endif
2843 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2844 hot_team->t.b->update_num_threads(new_nth);
2845 __kmp_add_threads_to_team(hot_team, new_nth);
2848 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2850 // Update the t_nproc field in the threads that are still active.
2851 for (f = 0; f < new_nth; f++) {
2852 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2853 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2855 // Special flag in case omp_set_num_threads() call
2856 hot_team->t.t_size_changed = -1;
2860 /* Changes max_active_levels */
2861 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2862 kmp_info_t *thread;
2864 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2865 "%d = (%d)\n",
2866 gtid, max_active_levels));
2867 KMP_DEBUG_ASSERT(__kmp_init_serial);
2869 // validate max_active_levels
2870 if (max_active_levels < 0) {
2871 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2872 // We ignore this call if the user has specified a negative value.
2873 // The current setting won't be changed. The last valid setting will be
2874 // used. A warning will be issued (if warnings are allowed as controlled by
2875 // the KMP_WARNINGS env var).
2876 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2877 "max_active_levels for thread %d = (%d)\n",
2878 gtid, max_active_levels));
2879 return;
2881 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2882 // it's OK, the max_active_levels is within the valid range: [ 0;
2883 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2884 // We allow a zero value. (implementation defined behavior)
2885 } else {
2886 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2887 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2888 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2889 // Current upper limit is MAX_INT. (implementation defined behavior)
2890 // If the input exceeds the upper limit, we correct the input to be the
2891 // upper limit. (implementation defined behavior)
2892 // Actually, the flow should never get here until we use MAX_INT limit.
2894 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2895 "max_active_levels for thread %d = (%d)\n",
2896 gtid, max_active_levels));
2898 thread = __kmp_threads[gtid];
2900 __kmp_save_internal_controls(thread);
2902 set__max_active_levels(thread, max_active_levels);
2905 /* Gets max_active_levels */
2906 int __kmp_get_max_active_levels(int gtid) {
2907 kmp_info_t *thread;
2909 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2910 KMP_DEBUG_ASSERT(__kmp_init_serial);
2912 thread = __kmp_threads[gtid];
2913 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2914 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2915 "curtask_maxaclevel=%d\n",
2916 gtid, thread->th.th_current_task,
2917 thread->th.th_current_task->td_icvs.max_active_levels));
2918 return thread->th.th_current_task->td_icvs.max_active_levels;
2921 // nteams-var per-device ICV
2922 void __kmp_set_num_teams(int num_teams) {
2923 if (num_teams > 0)
2924 __kmp_nteams = num_teams;
2926 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2927 // teams-thread-limit-var per-device ICV
2928 void __kmp_set_teams_thread_limit(int limit) {
2929 if (limit > 0)
2930 __kmp_teams_thread_limit = limit;
2932 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2934 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2935 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2937 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2938 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2939 kmp_info_t *thread;
2940 kmp_sched_t orig_kind;
2941 // kmp_team_t *team;
2943 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2944 gtid, (int)kind, chunk));
2945 KMP_DEBUG_ASSERT(__kmp_init_serial);
2947 // Check if the kind parameter is valid, correct if needed.
2948 // Valid parameters should fit in one of two intervals - standard or extended:
2949 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2950 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2951 orig_kind = kind;
2952 kind = __kmp_sched_without_mods(kind);
2954 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2955 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2956 // TODO: Hint needs attention in case we change the default schedule.
2957 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2958 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2959 __kmp_msg_null);
2960 kind = kmp_sched_default;
2961 chunk = 0; // ignore chunk value in case of bad kind
2964 thread = __kmp_threads[gtid];
2966 __kmp_save_internal_controls(thread);
2968 if (kind < kmp_sched_upper_std) {
2969 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2970 // differ static chunked vs. unchunked: chunk should be invalid to
2971 // indicate unchunked schedule (which is the default)
2972 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2973 } else {
2974 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2975 __kmp_sch_map[kind - kmp_sched_lower - 1];
2977 } else {
2978 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979 // kmp_sched_lower - 2 ];
2980 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2982 kmp_sched_lower - 2];
2984 __kmp_sched_apply_mods_intkind(
2985 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2986 if (kind == kmp_sched_auto || chunk < 1) {
2987 // ignore parameter chunk for schedule auto
2988 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2989 } else {
2990 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2994 /* Gets def_sched_var ICV values */
2995 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2996 kmp_info_t *thread;
2997 enum sched_type th_type;
2999 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3000 KMP_DEBUG_ASSERT(__kmp_init_serial);
3002 thread = __kmp_threads[gtid];
3004 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3005 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3006 case kmp_sch_static:
3007 case kmp_sch_static_greedy:
3008 case kmp_sch_static_balanced:
3009 *kind = kmp_sched_static;
3010 __kmp_sched_apply_mods_stdkind(kind, th_type);
3011 *chunk = 0; // chunk was not set, try to show this fact via zero value
3012 return;
3013 case kmp_sch_static_chunked:
3014 *kind = kmp_sched_static;
3015 break;
3016 case kmp_sch_dynamic_chunked:
3017 *kind = kmp_sched_dynamic;
3018 break;
3019 case kmp_sch_guided_chunked:
3020 case kmp_sch_guided_iterative_chunked:
3021 case kmp_sch_guided_analytical_chunked:
3022 *kind = kmp_sched_guided;
3023 break;
3024 case kmp_sch_auto:
3025 *kind = kmp_sched_auto;
3026 break;
3027 case kmp_sch_trapezoidal:
3028 *kind = kmp_sched_trapezoidal;
3029 break;
3030 #if KMP_STATIC_STEAL_ENABLED
3031 case kmp_sch_static_steal:
3032 *kind = kmp_sched_static_steal;
3033 break;
3034 #endif
3035 default:
3036 KMP_FATAL(UnknownSchedulingType, th_type);
3039 __kmp_sched_apply_mods_stdkind(kind, th_type);
3040 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3043 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3045 int ii, dd;
3046 kmp_team_t *team;
3047 kmp_info_t *thr;
3049 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3050 KMP_DEBUG_ASSERT(__kmp_init_serial);
3052 // validate level
3053 if (level == 0)
3054 return 0;
3055 if (level < 0)
3056 return -1;
3057 thr = __kmp_threads[gtid];
3058 team = thr->th.th_team;
3059 ii = team->t.t_level;
3060 if (level > ii)
3061 return -1;
3063 if (thr->th.th_teams_microtask) {
3064 // AC: we are in teams region where multiple nested teams have same level
3065 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3066 if (level <=
3067 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3068 KMP_DEBUG_ASSERT(ii >= tlevel);
3069 // AC: As we need to pass by the teams league, we need to artificially
3070 // increase ii
3071 if (ii == tlevel) {
3072 ii += 2; // three teams have same level
3073 } else {
3074 ii++; // two teams have same level
3079 if (ii == level)
3080 return __kmp_tid_from_gtid(gtid);
3082 dd = team->t.t_serialized;
3083 level++;
3084 while (ii > level) {
3085 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3087 if ((team->t.t_serialized) && (!dd)) {
3088 team = team->t.t_parent;
3089 continue;
3091 if (ii > level) {
3092 team = team->t.t_parent;
3093 dd = team->t.t_serialized;
3094 ii--;
3098 return (dd > 1) ? (0) : (team->t.t_master_tid);
3101 int __kmp_get_team_size(int gtid, int level) {
3103 int ii, dd;
3104 kmp_team_t *team;
3105 kmp_info_t *thr;
3107 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3108 KMP_DEBUG_ASSERT(__kmp_init_serial);
3110 // validate level
3111 if (level == 0)
3112 return 1;
3113 if (level < 0)
3114 return -1;
3115 thr = __kmp_threads[gtid];
3116 team = thr->th.th_team;
3117 ii = team->t.t_level;
3118 if (level > ii)
3119 return -1;
3121 if (thr->th.th_teams_microtask) {
3122 // AC: we are in teams region where multiple nested teams have same level
3123 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3124 if (level <=
3125 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3126 KMP_DEBUG_ASSERT(ii >= tlevel);
3127 // AC: As we need to pass by the teams league, we need to artificially
3128 // increase ii
3129 if (ii == tlevel) {
3130 ii += 2; // three teams have same level
3131 } else {
3132 ii++; // two teams have same level
3137 while (ii > level) {
3138 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3140 if (team->t.t_serialized && (!dd)) {
3141 team = team->t.t_parent;
3142 continue;
3144 if (ii > level) {
3145 team = team->t.t_parent;
3146 ii--;
3150 return team->t.t_nproc;
3153 kmp_r_sched_t __kmp_get_schedule_global() {
3154 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3155 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3156 // independently. So one can get the updated schedule here.
3158 kmp_r_sched_t r_sched;
3160 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3161 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3162 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3163 // different roots (even in OMP 2.5)
3164 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3165 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3166 if (s == kmp_sch_static) {
3167 // replace STATIC with more detailed schedule (balanced or greedy)
3168 r_sched.r_sched_type = __kmp_static;
3169 } else if (s == kmp_sch_guided_chunked) {
3170 // replace GUIDED with more detailed schedule (iterative or analytical)
3171 r_sched.r_sched_type = __kmp_guided;
3172 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3173 r_sched.r_sched_type = __kmp_sched;
3175 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3177 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3178 // __kmp_chunk may be wrong here (if it was not ever set)
3179 r_sched.chunk = KMP_DEFAULT_CHUNK;
3180 } else {
3181 r_sched.chunk = __kmp_chunk;
3184 return r_sched;
3187 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3188 at least argc number of *t_argv entries for the requested team. */
3189 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3191 KMP_DEBUG_ASSERT(team);
3192 if (!realloc || argc > team->t.t_max_argc) {
3194 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3195 "current entries=%d\n",
3196 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3197 /* if previously allocated heap space for args, free them */
3198 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3199 __kmp_free((void *)team->t.t_argv);
3201 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3202 /* use unused space in the cache line for arguments */
3203 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3204 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3205 "argv entries\n",
3206 team->t.t_id, team->t.t_max_argc));
3207 team->t.t_argv = &team->t.t_inline_argv[0];
3208 if (__kmp_storage_map) {
3209 __kmp_print_storage_map_gtid(
3210 -1, &team->t.t_inline_argv[0],
3211 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3212 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3213 team->t.t_id);
3215 } else {
3216 /* allocate space for arguments in the heap */
3217 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3218 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3219 : 2 * argc;
3220 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3221 "argv entries\n",
3222 team->t.t_id, team->t.t_max_argc));
3223 team->t.t_argv =
3224 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3225 if (__kmp_storage_map) {
3226 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3227 &team->t.t_argv[team->t.t_max_argc],
3228 sizeof(void *) * team->t.t_max_argc,
3229 "team_%d.t_argv", team->t.t_id);
3235 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3236 int i;
3237 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3238 team->t.t_threads =
3239 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3240 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3241 sizeof(dispatch_shared_info_t) * num_disp_buff);
3242 team->t.t_dispatch =
3243 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3244 team->t.t_implicit_task_taskdata =
3245 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3246 team->t.t_max_nproc = max_nth;
3248 /* setup dispatch buffers */
3249 for (i = 0; i < num_disp_buff; ++i) {
3250 team->t.t_disp_buffer[i].buffer_index = i;
3251 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3255 static void __kmp_free_team_arrays(kmp_team_t *team) {
3256 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3257 int i;
3258 for (i = 0; i < team->t.t_max_nproc; ++i) {
3259 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3260 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3261 team->t.t_dispatch[i].th_disp_buffer = NULL;
3264 #if KMP_USE_HIER_SCHED
3265 __kmp_dispatch_free_hierarchies(team);
3266 #endif
3267 __kmp_free(team->t.t_threads);
3268 __kmp_free(team->t.t_disp_buffer);
3269 __kmp_free(team->t.t_dispatch);
3270 __kmp_free(team->t.t_implicit_task_taskdata);
3271 team->t.t_threads = NULL;
3272 team->t.t_disp_buffer = NULL;
3273 team->t.t_dispatch = NULL;
3274 team->t.t_implicit_task_taskdata = 0;
3277 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3278 kmp_info_t **oldThreads = team->t.t_threads;
3280 __kmp_free(team->t.t_disp_buffer);
3281 __kmp_free(team->t.t_dispatch);
3282 __kmp_free(team->t.t_implicit_task_taskdata);
3283 __kmp_allocate_team_arrays(team, max_nth);
3285 KMP_MEMCPY(team->t.t_threads, oldThreads,
3286 team->t.t_nproc * sizeof(kmp_info_t *));
3288 __kmp_free(oldThreads);
3291 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3293 kmp_r_sched_t r_sched =
3294 __kmp_get_schedule_global(); // get current state of scheduling globals
3296 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3298 kmp_internal_control_t g_icvs = {
3299 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3300 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3301 // adjustment of threads (per thread)
3302 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3303 // whether blocktime is explicitly set
3304 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3305 #if KMP_USE_MONITOR
3306 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3307 // intervals
3308 #endif
3309 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3310 // next parallel region (per thread)
3311 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3312 __kmp_cg_max_nth, // int thread_limit;
3313 __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3314 // on task. This is used in the case of target thread_limit
3315 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3316 // for max_active_levels
3317 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3318 // {sched,chunk} pair
3319 __kmp_nested_proc_bind.bind_types[0],
3320 __kmp_default_device,
3321 NULL // struct kmp_internal_control *next;
3324 return g_icvs;
3327 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3329 kmp_internal_control_t gx_icvs;
3330 gx_icvs.serial_nesting_level =
3331 0; // probably =team->t.t_serial like in save_inter_controls
3332 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3333 gx_icvs.next = NULL;
3335 return gx_icvs;
3338 static void __kmp_initialize_root(kmp_root_t *root) {
3339 int f;
3340 kmp_team_t *root_team;
3341 kmp_team_t *hot_team;
3342 int hot_team_max_nth;
3343 kmp_r_sched_t r_sched =
3344 __kmp_get_schedule_global(); // get current state of scheduling globals
3345 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3346 KMP_DEBUG_ASSERT(root);
3347 KMP_ASSERT(!root->r.r_begin);
3349 /* setup the root state structure */
3350 __kmp_init_lock(&root->r.r_begin_lock);
3351 root->r.r_begin = FALSE;
3352 root->r.r_active = FALSE;
3353 root->r.r_in_parallel = 0;
3354 root->r.r_blocktime = __kmp_dflt_blocktime;
3355 #if KMP_AFFINITY_SUPPORTED
3356 root->r.r_affinity_assigned = FALSE;
3357 #endif
3359 /* setup the root team for this task */
3360 /* allocate the root team structure */
3361 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3363 root_team =
3364 __kmp_allocate_team(root,
3365 1, // new_nproc
3366 1, // max_nproc
3367 #if OMPT_SUPPORT
3368 ompt_data_none, // root parallel id
3369 #endif
3370 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3371 0 // argc
3372 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3374 #if USE_DEBUGGER
3375 // Non-NULL value should be assigned to make the debugger display the root
3376 // team.
3377 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3378 #endif
3380 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3382 root->r.r_root_team = root_team;
3383 root_team->t.t_control_stack_top = NULL;
3385 /* initialize root team */
3386 root_team->t.t_threads[0] = NULL;
3387 root_team->t.t_nproc = 1;
3388 root_team->t.t_serialized = 1;
3389 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3390 root_team->t.t_sched.sched = r_sched.sched;
3391 KA_TRACE(
3393 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3394 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3396 /* setup the hot team for this task */
3397 /* allocate the hot team structure */
3398 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3400 hot_team =
3401 __kmp_allocate_team(root,
3402 1, // new_nproc
3403 __kmp_dflt_team_nth_ub * 2, // max_nproc
3404 #if OMPT_SUPPORT
3405 ompt_data_none, // root parallel id
3406 #endif
3407 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3408 0 // argc
3409 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3411 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3413 root->r.r_hot_team = hot_team;
3414 root_team->t.t_control_stack_top = NULL;
3416 /* first-time initialization */
3417 hot_team->t.t_parent = root_team;
3419 /* initialize hot team */
3420 hot_team_max_nth = hot_team->t.t_max_nproc;
3421 for (f = 0; f < hot_team_max_nth; ++f) {
3422 hot_team->t.t_threads[f] = NULL;
3424 hot_team->t.t_nproc = 1;
3425 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3426 hot_team->t.t_sched.sched = r_sched.sched;
3427 hot_team->t.t_size_changed = 0;
3430 #ifdef KMP_DEBUG
3432 typedef struct kmp_team_list_item {
3433 kmp_team_p const *entry;
3434 struct kmp_team_list_item *next;
3435 } kmp_team_list_item_t;
3436 typedef kmp_team_list_item_t *kmp_team_list_t;
3438 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3439 kmp_team_list_t list, // List of teams.
3440 kmp_team_p const *team // Team to add.
3443 // List must terminate with item where both entry and next are NULL.
3444 // Team is added to the list only once.
3445 // List is sorted in ascending order by team id.
3446 // Team id is *not* a key.
3448 kmp_team_list_t l;
3450 KMP_DEBUG_ASSERT(list != NULL);
3451 if (team == NULL) {
3452 return;
3455 __kmp_print_structure_team_accum(list, team->t.t_parent);
3456 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3458 // Search list for the team.
3459 l = list;
3460 while (l->next != NULL && l->entry != team) {
3461 l = l->next;
3463 if (l->next != NULL) {
3464 return; // Team has been added before, exit.
3467 // Team is not found. Search list again for insertion point.
3468 l = list;
3469 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3470 l = l->next;
3473 // Insert team.
3475 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3476 sizeof(kmp_team_list_item_t));
3477 *item = *l;
3478 l->entry = team;
3479 l->next = item;
3483 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3486 __kmp_printf("%s", title);
3487 if (team != NULL) {
3488 __kmp_printf("%2x %p\n", team->t.t_id, team);
3489 } else {
3490 __kmp_printf(" - (nil)\n");
3494 static void __kmp_print_structure_thread(char const *title,
3495 kmp_info_p const *thread) {
3496 __kmp_printf("%s", title);
3497 if (thread != NULL) {
3498 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3499 } else {
3500 __kmp_printf(" - (nil)\n");
3504 void __kmp_print_structure(void) {
3506 kmp_team_list_t list;
3508 // Initialize list of teams.
3509 list =
3510 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3511 list->entry = NULL;
3512 list->next = NULL;
3514 __kmp_printf("\n------------------------------\nGlobal Thread "
3515 "Table\n------------------------------\n");
3517 int gtid;
3518 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3519 __kmp_printf("%2d", gtid);
3520 if (__kmp_threads != NULL) {
3521 __kmp_printf(" %p", __kmp_threads[gtid]);
3523 if (__kmp_root != NULL) {
3524 __kmp_printf(" %p", __kmp_root[gtid]);
3526 __kmp_printf("\n");
3530 // Print out __kmp_threads array.
3531 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3532 "----------\n");
3533 if (__kmp_threads != NULL) {
3534 int gtid;
3535 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536 kmp_info_t const *thread = __kmp_threads[gtid];
3537 if (thread != NULL) {
3538 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3539 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3540 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3541 __kmp_print_structure_team(" Serial Team: ",
3542 thread->th.th_serial_team);
3543 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3544 __kmp_print_structure_thread(" Primary: ",
3545 thread->th.th_team_master);
3546 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3547 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3548 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3549 __kmp_print_structure_thread(" Next in pool: ",
3550 thread->th.th_next_pool);
3551 __kmp_printf("\n");
3552 __kmp_print_structure_team_accum(list, thread->th.th_team);
3553 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3556 } else {
3557 __kmp_printf("Threads array is not allocated.\n");
3560 // Print out __kmp_root array.
3561 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3562 "--------\n");
3563 if (__kmp_root != NULL) {
3564 int gtid;
3565 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3566 kmp_root_t const *root = __kmp_root[gtid];
3567 if (root != NULL) {
3568 __kmp_printf("GTID %2d %p:\n", gtid, root);
3569 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3570 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3571 __kmp_print_structure_thread(" Uber Thread: ",
3572 root->r.r_uber_thread);
3573 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3574 __kmp_printf(" In Parallel: %2d\n",
3575 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3576 __kmp_printf("\n");
3577 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3578 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3581 } else {
3582 __kmp_printf("Ubers array is not allocated.\n");
3585 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3586 "--------\n");
3587 while (list->next != NULL) {
3588 kmp_team_p const *team = list->entry;
3589 int i;
3590 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3591 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3592 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3593 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3594 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3595 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3596 for (i = 0; i < team->t.t_nproc; ++i) {
3597 __kmp_printf(" Thread %2d: ", i);
3598 __kmp_print_structure_thread("", team->t.t_threads[i]);
3600 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3601 __kmp_printf("\n");
3602 list = list->next;
3605 // Print out __kmp_thread_pool and __kmp_team_pool.
3606 __kmp_printf("\n------------------------------\nPools\n----------------------"
3607 "--------\n");
3608 __kmp_print_structure_thread("Thread pool: ",
3609 CCAST(kmp_info_t *, __kmp_thread_pool));
3610 __kmp_print_structure_team("Team pool: ",
3611 CCAST(kmp_team_t *, __kmp_team_pool));
3612 __kmp_printf("\n");
3614 // Free team list.
3615 while (list != NULL) {
3616 kmp_team_list_item_t *item = list;
3617 list = list->next;
3618 KMP_INTERNAL_FREE(item);
3622 #endif
//---------------------------------------------------------------------------
// Support for the per-thread fast random number generator.
//
// Table of primes. __kmp_init_random() picks one entry, indexed by the
// thread's tid modulo the table length, as that thread's multiplier.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
    0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
    0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
    0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
    0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
    0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};

3640 //---------------------------------------------------------------------------
3641 // __kmp_get_random: Get a random number using a linear congruential method.
3642 unsigned short __kmp_get_random(kmp_info_t *thread) {
3643 unsigned x = thread->th.th_x;
3644 unsigned short r = (unsigned short)(x >> 16);
3646 thread->th.th_x = x * thread->th.th_a + 1;
3648 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3649 thread->th.th_info.ds.ds_tid, r));
3651 return r;
3653 //--------------------------------------------------------
3654 // __kmp_init_random: Initialize a random number generator
3655 void __kmp_init_random(kmp_info_t *thread) {
3656 unsigned seed = thread->th.th_info.ds.ds_tid;
3658 thread->th.th_a =
3659 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3660 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3661 KA_TRACE(30,
3662 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
#if KMP_OS_WINDOWS
/* Reclaim array entries for root threads that are already dead.
   Walks every gtid below __kmp_threads_capacity and unregisters each uber
   thread that is no longer running and whose root is not active.
   Returns the number of entries reclaimed. */
static int __kmp_reclaim_dead_roots(void) {
  int reclaimed = 0;

  for (int gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
    if (!KMP_UBER_GTID(gtid))
      continue; // not a root (uber) thread slot

    if (__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid])))
      continue; // thread is still alive

    // AC: reclaim only roots died in non-active state
    if (__kmp_root[gtid]->r.r_active)
      continue;

    reclaimed += __kmp_unregister_root_other_thread(gtid);
  }

  return reclaimed;
}
#endif

3683 /* This function attempts to create free entries in __kmp_threads and
3684 __kmp_root, and returns the number of free entries generated.
3686 For Windows* OS static library, the first mechanism used is to reclaim array
3687 entries for root threads that are already dead.
3689 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3690 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3691 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3692 threadprivate cache array has been created. Synchronization with
3693 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3695 After any dead root reclamation, if the clipping value allows array expansion
3696 to result in the generation of a total of nNeed free slots, the function does
3697 that expansion. If not, nothing is done beyond the possible initial root
3698 thread reclamation.
3700 If any argument is negative, the behavior is undefined. */
3701 static int __kmp_expand_threads(int nNeed) {
3702 int added = 0;
3703 int minimumRequiredCapacity;
3704 int newCapacity;
3705 kmp_info_t **newThreads;
3706 kmp_root_t **newRoot;
3708 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3709 // resizing __kmp_threads does not need additional protection if foreign
3710 // threads are present
3712 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3713 /* only for Windows static library */
3714 /* reclaim array entries for root threads that are already dead */
3715 added = __kmp_reclaim_dead_roots();
3717 if (nNeed) {
3718 nNeed -= added;
3719 if (nNeed < 0)
3720 nNeed = 0;
3722 #endif
3723 if (nNeed <= 0)
3724 return added;
3726 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3727 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3728 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3729 // > __kmp_max_nth in one of two ways:
3731 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3732 // may not be reused by another thread, so we may need to increase
3733 // __kmp_threads_capacity to __kmp_max_nth + 1.
3735 // 2) New foreign root(s) are encountered. We always register new foreign
3736 // roots. This may cause a smaller # of threads to be allocated at
3737 // subsequent parallel regions, but the worker threads hang around (and
3738 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3740 // Anyway, that is the reason for moving the check to see if
3741 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3742 // instead of having it performed here. -BB
3744 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3746 /* compute expansion headroom to check if we can expand */
3747 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3748 /* possible expansion too small -- give up */
3749 return added;
3751 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3753 newCapacity = __kmp_threads_capacity;
3754 do {
3755 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3756 : __kmp_sys_max_nth;
3757 } while (newCapacity < minimumRequiredCapacity);
3758 newThreads = (kmp_info_t **)__kmp_allocate(
3759 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3760 newRoot =
3761 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3762 KMP_MEMCPY(newThreads, __kmp_threads,
3763 __kmp_threads_capacity * sizeof(kmp_info_t *));
3764 KMP_MEMCPY(newRoot, __kmp_root,
3765 __kmp_threads_capacity * sizeof(kmp_root_t *));
3766 // Put old __kmp_threads array on a list. Any ongoing references to the old
3767 // list will be valid. This list is cleaned up at library shutdown.
3768 kmp_old_threads_list_t *node =
3769 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3770 node->threads = __kmp_threads;
3771 node->next = __kmp_old_threads_list;
3772 __kmp_old_threads_list = node;
3774 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3775 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3776 added += newCapacity - __kmp_threads_capacity;
3777 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3779 if (newCapacity > __kmp_tp_capacity) {
3780 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3781 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3782 __kmp_threadprivate_resize_cache(newCapacity);
3783 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3784 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3786 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3789 return added;
3792 /* Register the current thread as a root thread and obtain our gtid. We must
3793 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3794 thread that calls from __kmp_do_serial_initialize() */
3795 int __kmp_register_root(int initial_thread) {
3796 kmp_info_t *root_thread;
3797 kmp_root_t *root;
3798 int gtid;
3799 int capacity;
3800 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3801 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3802 KMP_MB();
3804 /* 2007-03-02:
3805 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3806 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3807 work as expected -- it may return false (that means there is at least one
3808 empty slot in __kmp_threads array), but it is possible the only free slot
3809 is #0, which is reserved for initial thread and so cannot be used for this
3810 one. Following code workarounds this bug.
3812 However, right solution seems to be not reserving slot #0 for initial
3813 thread because:
3814 (1) there is no magic in slot #0,
3815 (2) we cannot detect initial thread reliably (the first thread which does
3816 serial initialization may be not a real initial thread).
3818 capacity = __kmp_threads_capacity;
3819 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3820 --capacity;
3823 // If it is not for initializing the hidden helper team, we need to take
3824 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3825 // in __kmp_threads_capacity.
3826 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3827 capacity -= __kmp_hidden_helper_threads_num;
3830 /* see if there are too many threads */
3831 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3832 if (__kmp_tp_cached) {
3833 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3834 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3835 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3836 } else {
3837 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3838 __kmp_msg_null);
3842 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3843 // 0: initial thread, also a regular OpenMP thread.
3844 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3845 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3846 // regular OpenMP threads.
3847 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3848 // Find an available thread slot for hidden helper thread. Slots for hidden
3849 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3850 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3851 gtid <= __kmp_hidden_helper_threads_num;
3852 gtid++)
3854 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3855 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3856 "hidden helper thread: T#%d\n",
3857 gtid));
3858 } else {
3859 /* find an available thread slot */
3860 // Don't reassign the zero slot since we need that to only be used by
3861 // initial thread. Slots for hidden helper threads should also be skipped.
3862 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3863 gtid = 0;
3864 } else {
3865 for (gtid = __kmp_hidden_helper_threads_num + 1;
3866 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3869 KA_TRACE(
3870 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3871 KMP_ASSERT(gtid < __kmp_threads_capacity);
3874 /* update global accounting */
3875 __kmp_all_nth++;
3876 TCW_4(__kmp_nth, __kmp_nth + 1);
3878 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3879 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3880 if (__kmp_adjust_gtid_mode) {
3881 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3882 if (TCR_4(__kmp_gtid_mode) != 2) {
3883 TCW_4(__kmp_gtid_mode, 2);
3885 } else {
3886 if (TCR_4(__kmp_gtid_mode) != 1) {
3887 TCW_4(__kmp_gtid_mode, 1);
3892 #ifdef KMP_ADJUST_BLOCKTIME
3893 /* Adjust blocktime to zero if necessary */
3894 /* Middle initialization might not have occurred yet */
3895 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3896 if (__kmp_nth > __kmp_avail_proc) {
3897 __kmp_zero_bt = TRUE;
3900 #endif /* KMP_ADJUST_BLOCKTIME */
3902 /* setup this new hierarchy */
3903 if (!(root = __kmp_root[gtid])) {
3904 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3905 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3908 #if KMP_STATS_ENABLED
3909 // Initialize stats as soon as possible (right after gtid assignment).
3910 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3911 __kmp_stats_thread_ptr->startLife();
3912 KMP_SET_THREAD_STATE(SERIAL_REGION);
3913 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3914 #endif
3915 __kmp_initialize_root(root);
3917 /* setup new root thread structure */
3918 if (root->r.r_uber_thread) {
3919 root_thread = root->r.r_uber_thread;
3920 } else {
3921 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3922 if (__kmp_storage_map) {
3923 __kmp_print_thread_storage_map(root_thread, gtid);
3925 root_thread->th.th_info.ds.ds_gtid = gtid;
3926 #if OMPT_SUPPORT
3927 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3928 #endif
3929 root_thread->th.th_root = root;
3930 if (__kmp_env_consistency_check) {
3931 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3933 #if USE_FAST_MEMORY
3934 __kmp_initialize_fast_memory(root_thread);
3935 #endif /* USE_FAST_MEMORY */
3937 #if KMP_USE_BGET
3938 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3939 __kmp_initialize_bget(root_thread);
3940 #endif
3941 __kmp_init_random(root_thread); // Initialize random number generator
3944 /* setup the serial team held in reserve by the root thread */
3945 if (!root_thread->th.th_serial_team) {
3946 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3947 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3948 root_thread->th.th_serial_team = __kmp_allocate_team(
3949 root, 1, 1,
3950 #if OMPT_SUPPORT
3951 ompt_data_none, // root parallel id
3952 #endif
3953 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3955 KMP_ASSERT(root_thread->th.th_serial_team);
3956 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3957 root_thread->th.th_serial_team));
3959 /* drop root_thread into place */
3960 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3962 root->r.r_root_team->t.t_threads[0] = root_thread;
3963 root->r.r_hot_team->t.t_threads[0] = root_thread;
3964 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3965 // AC: the team created in reserve, not for execution (it is unused for now).
3966 root_thread->th.th_serial_team->t.t_serialized = 0;
3967 root->r.r_uber_thread = root_thread;
3969 /* initialize the thread, get it ready to go */
3970 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3971 TCW_4(__kmp_init_gtid, TRUE);
3973 /* prepare the primary thread for get_gtid() */
3974 __kmp_gtid_set_specific(gtid);
3976 #if USE_ITT_BUILD
3977 __kmp_itt_thread_name(gtid);
3978 #endif /* USE_ITT_BUILD */
3980 #ifdef KMP_TDATA_GTID
3981 __kmp_gtid = gtid;
3982 #endif
3983 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3984 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3986 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3987 "plain=%u\n",
3988 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3989 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3990 KMP_INIT_BARRIER_STATE));
3991 { // Initialize barrier data.
3992 int b;
3993 for (b = 0; b < bs_last_barrier; ++b) {
3994 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3995 #if USE_DEBUGGER
3996 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3997 #endif
4000 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4001 KMP_INIT_BARRIER_STATE);
4003 #if KMP_AFFINITY_SUPPORTED
4004 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4005 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4006 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4007 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4008 #endif /* KMP_AFFINITY_SUPPORTED */
4009 root_thread->th.th_def_allocator = __kmp_def_allocator;
4010 root_thread->th.th_prev_level = 0;
4011 root_thread->th.th_prev_num_threads = 1;
4013 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4014 tmp->cg_root = root_thread;
4015 tmp->cg_thread_limit = __kmp_cg_max_nth;
4016 tmp->cg_nthreads = 1;
4017 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4018 " cg_nthreads init to 1\n",
4019 root_thread, tmp));
4020 tmp->up = NULL;
4021 root_thread->th.th_cg_roots = tmp;
4023 __kmp_root_counter++;
4025 #if OMPT_SUPPORT
4026 if (!initial_thread && ompt_enabled.enabled) {
4028 kmp_info_t *root_thread = ompt_get_thread();
4030 ompt_set_thread_state(root_thread, ompt_state_overhead);
4032 if (ompt_enabled.ompt_callback_thread_begin) {
4033 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4034 ompt_thread_initial, __ompt_get_thread_data_internal());
4036 ompt_data_t *task_data;
4037 ompt_data_t *parallel_data;
4038 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4039 NULL);
4040 if (ompt_enabled.ompt_callback_implicit_task) {
4041 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4042 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4045 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4047 #endif
4048 #if OMPD_SUPPORT
4049 if (ompd_state & OMPD_ENABLE_BP)
4050 ompd_bp_thread_begin();
4051 #endif
4053 KMP_MB();
4054 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4056 return gtid;
#if KMP_NESTED_HOT_TEAMS
// Frees the hot team that `thr` keeps at nesting `level`, first recursing
// into the deeper levels kept by that team's threads. Returns the number of
// threads counted as freed; the primary thread of each team is not counted.
// Returns 0 when `thr` has no hot-team array or no hot team at `level`.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  kmp_hot_team_ptr_t *entries = thr->th.th_hot_teams;
  if (entries == NULL) {
    return 0;
  }
  if (entries[level].hot_team == NULL) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);

  kmp_team_t *team = entries[level].hot_team;
  const int nth = entries[level].hot_team_nth;
  int freed = nth - 1; // primary thread is not freed

  if (level < max_level - 1) {
    for (int tid = 0; tid < nth; ++tid) {
      kmp_info_t *member = team->t.t_threads[tid];
      freed += __kmp_free_hot_teams(root, member, level + 1, max_level);
      // Only the workers' hot-team arrays are released here (tid 0 is
      // skipped).
      if (tid == 0 || member->th.th_hot_teams == NULL) {
        continue;
      }
      __kmp_free(member->th.th_hot_teams);
      member->th.th_hot_teams = NULL;
    }
  }

  __kmp_free_team(root, team, NULL);
  return freed;
}
#endif

4086 // Resets a root thread and clear its root and hot teams.
4087 // Returns the number of __kmp_threads entries directly and indirectly freed.
4088 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4089 kmp_team_t *root_team = root->r.r_root_team;
4090 kmp_team_t *hot_team = root->r.r_hot_team;
4091 int n = hot_team->t.t_nproc;
4092 int i;
4094 KMP_DEBUG_ASSERT(!root->r.r_active);
4096 root->r.r_root_team = NULL;
4097 root->r.r_hot_team = NULL;
4098 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4099 // before call to __kmp_free_team().
4100 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4101 #if KMP_NESTED_HOT_TEAMS
4102 if (__kmp_hot_teams_max_level >
4103 0) { // need to free nested hot teams and their threads if any
4104 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4105 kmp_info_t *th = hot_team->t.t_threads[i];
4106 if (__kmp_hot_teams_max_level > 1) {
4107 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4109 if (th->th.th_hot_teams) {
4110 __kmp_free(th->th.th_hot_teams);
4111 th->th.th_hot_teams = NULL;
4115 #endif
4116 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4118 // Before we can reap the thread, we need to make certain that all other
4119 // threads in the teams that had this root as ancestor have stopped trying to
4120 // steal tasks.
4121 if (__kmp_tasking_mode != tskm_immediate_exec) {
4122 __kmp_wait_to_unref_task_teams();
4125 #if KMP_OS_WINDOWS
4126 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4127 KA_TRACE(
4128 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4129 "\n",
4130 (LPVOID) & (root->r.r_uber_thread->th),
4131 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4132 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4133 #endif /* KMP_OS_WINDOWS */
4135 #if OMPD_SUPPORT
4136 if (ompd_state & OMPD_ENABLE_BP)
4137 ompd_bp_thread_end();
4138 #endif
4140 #if OMPT_SUPPORT
4141 ompt_data_t *task_data;
4142 ompt_data_t *parallel_data;
4143 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4144 NULL);
4145 if (ompt_enabled.ompt_callback_implicit_task) {
4146 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4147 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4149 if (ompt_enabled.ompt_callback_thread_end) {
4150 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4151 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4153 #endif
4155 TCW_4(__kmp_nth,
4156 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4157 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4158 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4159 " to %d\n",
4160 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4161 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4162 if (i == 1) {
4163 // need to free contention group structure
4164 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4165 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4166 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4167 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4168 root->r.r_uber_thread->th.th_cg_roots = NULL;
4170 __kmp_reap_thread(root->r.r_uber_thread, 1);
4172 // We canot put root thread to __kmp_thread_pool, so we have to reap it
4173 // instead of freeing.
4174 root->r.r_uber_thread = NULL;
4175 /* mark root as no longer in use */
4176 root->r.r_begin = FALSE;
4178 return n;
4181 void __kmp_unregister_root_current_thread(int gtid) {
4182 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4183 /* this lock should be ok, since unregister_root_current_thread is never
4184 called during an abort, only during a normal close. furthermore, if you
4185 have the forkjoin lock, you should never try to get the initz lock */
4186 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4187 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4188 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4189 "exiting T#%d\n",
4190 gtid));
4191 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4192 return;
4194 kmp_root_t *root = __kmp_root[gtid];
4196 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4197 KMP_ASSERT(KMP_UBER_GTID(gtid));
4198 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4199 KMP_ASSERT(root->r.r_active == FALSE);
4201 KMP_MB();
4203 kmp_info_t *thread = __kmp_threads[gtid];
4204 kmp_team_t *team = thread->th.th_team;
4205 kmp_task_team_t *task_team = thread->th.th_task_team;
4207 // we need to wait for the proxy tasks before finishing the thread
4208 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4209 task_team->tt.tt_hidden_helper_task_encountered)) {
4210 #if OMPT_SUPPORT
4211 // the runtime is shutting down so we won't report any events
4212 thread->th.ompt_thread_info.state = ompt_state_undefined;
4213 #endif
4214 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4217 __kmp_reset_root(gtid, root);
4219 KMP_MB();
4220 KC_TRACE(10,
4221 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4223 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if KMP_OS_WINDOWS
/* Unregisters a root thread that is not the current thread.
   __kmp_forkjoin_lock must already be held by the caller.
   Returns the number of __kmp_threads entries freed as a result. */
static int __kmp_unregister_root_other_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));

  kmp_root_t *root = __kmp_root[gtid];
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  const int freed = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif

#if KMP_DEBUG
// Debug helper: prints the calling thread's gtid, tid, thread structure,
// current team, serial team, current task and the parent of its implicit
// task in the current team.
void __kmp_task_info() {
  const kmp_int32 gtid = __kmp_entry_gtid();
  const kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *self = __kmp_threads[gtid];
  kmp_team_t *serial_team = self->th.th_serial_team;
  kmp_team_t *cur_team = self->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, self, cur_team, serial_team, self->th.th_current_task,
      cur_team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG

4264 /* TODO optimize with one big memclr, take out what isn't needed, split
4265 responsibility to workers as much as possible, and delay initialization of
4266 features as much as possible */
4267 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4268 int tid, int gtid) {
4269 /* this_thr->th.th_info.ds.ds_gtid is setup in
4270 kmp_allocate_thread/create_worker.
4271 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4272 KMP_DEBUG_ASSERT(this_thr != NULL);
4273 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4274 KMP_DEBUG_ASSERT(team);
4275 KMP_DEBUG_ASSERT(team->t.t_threads);
4276 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277 kmp_info_t *master = team->t.t_threads[0];
4278 KMP_DEBUG_ASSERT(master);
4279 KMP_DEBUG_ASSERT(master->th.th_root);
4281 KMP_MB();
4283 TCW_SYNC_PTR(this_thr->th.th_team, team);
4285 this_thr->th.th_info.ds.ds_tid = tid;
4286 this_thr->th.th_set_nproc = 0;
4287 if (__kmp_tasking_mode != tskm_immediate_exec)
4288 // When tasking is possible, threads are not safe to reap until they are
4289 // done tasking; this will be set when tasking code is exited in wait
4290 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4291 else // no tasking --> always safe to reap
4292 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4293 this_thr->th.th_set_proc_bind = proc_bind_default;
4294 #if KMP_AFFINITY_SUPPORTED
4295 this_thr->th.th_new_place = this_thr->th.th_current_place;
4296 #endif
4297 this_thr->th.th_root = master->th.th_root;
4299 /* setup the thread's cache of the team structure */
4300 this_thr->th.th_team_nproc = team->t.t_nproc;
4301 this_thr->th.th_team_master = master;
4302 this_thr->th.th_team_serialized = team->t.t_serialized;
4304 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4306 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4307 tid, gtid, this_thr, this_thr->th.th_current_task));
4309 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4310 team, tid, TRUE);
4312 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4313 tid, gtid, this_thr, this_thr->th.th_current_task));
4314 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4315 // __kmp_initialize_team()?
4317 /* TODO no worksharing in speculative threads */
4318 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4320 this_thr->th.th_local.this_construct = 0;
4322 if (!this_thr->th.th_pri_common) {
4323 this_thr->th.th_pri_common =
4324 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4325 if (__kmp_storage_map) {
4326 __kmp_print_storage_map_gtid(
4327 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4328 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4330 this_thr->th.th_pri_head = NULL;
4333 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4334 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4335 // Make new thread's CG root same as primary thread's
4336 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4337 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4338 if (tmp) {
4339 // worker changes CG, need to check if old CG should be freed
4340 int i = tmp->cg_nthreads--;
4341 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4342 " on node %p of thread %p to %d\n",
4343 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4344 if (i == 1) {
4345 __kmp_free(tmp); // last thread left CG --> free it
4348 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4349 // Increment new thread's CG root's counter to add the new thread
4350 this_thr->th.th_cg_roots->cg_nthreads++;
4351 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4352 " node %p of thread %p to %d\n",
4353 this_thr, this_thr->th.th_cg_roots,
4354 this_thr->th.th_cg_roots->cg_root,
4355 this_thr->th.th_cg_roots->cg_nthreads));
4356 this_thr->th.th_current_task->td_icvs.thread_limit =
4357 this_thr->th.th_cg_roots->cg_thread_limit;
4360 /* Initialize dynamic dispatch */
4362 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4363 // Use team max_nproc since this will never change for the team.
4364 size_t disp_size =
4365 sizeof(dispatch_private_info_t) *
4366 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4367 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4368 team->t.t_max_nproc));
4369 KMP_ASSERT(dispatch);
4370 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4371 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4373 dispatch->th_disp_index = 0;
4374 dispatch->th_doacross_buf_idx = 0;
4375 if (!dispatch->th_disp_buffer) {
4376 dispatch->th_disp_buffer =
4377 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4379 if (__kmp_storage_map) {
4380 __kmp_print_storage_map_gtid(
4381 gtid, &dispatch->th_disp_buffer[0],
4382 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4384 : __kmp_dispatch_num_buffers],
4385 disp_size,
4386 "th_%d.th_dispatch.th_disp_buffer "
4387 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4388 gtid, team->t.t_id, gtid);
4390 } else {
4391 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4394 dispatch->th_dispatch_pr_current = 0;
4395 dispatch->th_dispatch_sh_current = 0;
4397 dispatch->th_deo_fcn = 0; /* ORDERED */
4398 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4401 this_thr->th.th_next_pool = NULL;
4403 if (!this_thr->th.th_task_state_memo_stack) {
4404 size_t i;
4405 this_thr->th.th_task_state_memo_stack =
4406 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4407 this_thr->th.th_task_state_top = 0;
4408 this_thr->th.th_task_state_stack_sz = 4;
4409 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4410 ++i) // zero init the stack
4411 this_thr->th.th_task_state_memo_stack[i] = 0;
4414 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4415 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4417 KMP_MB();
4420 /* allocate a new thread for the requesting team. this is only called from
4421 within a forkjoin critical section. we will first try to get an available
4422 thread from the thread pool. if none is available, we will fork a new one
4423 assuming we are able to create a new one. this should be assured, as the
4424 caller should check on this first. */
// Parameters:
//   root    - forwarded to __kmp_allocate_team() when the reserve serial team
//             of a freshly created thread is allocated.
//   team    - team the thread is being set up for; forwarded to
//             __kmp_initialize_info() and passed to __kmp_get_x_global_icvs()
//             to obtain the ICVs for the reserve serial team.
//   new_tid - thread id within 'team', forwarded to __kmp_initialize_info().
// Returns the kmp_info_t of the reused (pooled) or newly created thread.
4425 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4426 int new_tid) {
4427 kmp_team_t *serial_team;
4428 kmp_info_t *new_thr;
4429 int new_gtid;
4431 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4432 KMP_DEBUG_ASSERT(root && team);
4433 #if !KMP_NESTED_HOT_TEAMS
4434 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4435 #endif
4436 KMP_MB();
4438 /* first, try to get one from the thread pool */
4439 if (__kmp_thread_pool) {
// Unlink the head of the pool list; the pool now starts at its successor.
4440 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
// The insertion point must not keep referring to the thread just removed.
4442 if (new_thr == __kmp_thread_pool_insert_pt) {
4443 __kmp_thread_pool_insert_pt = NULL;
4445 TCW_4(new_thr->th.th_in_pool, FALSE);
4446 __kmp_suspend_initialize_thread(new_thr);
// While holding the thread's suspend mutex, remove it from the count of
// active pool threads if it was still marked active in the pool.
4447 __kmp_lock_suspend_mx(new_thr);
4448 if (new_thr->th.th_active_in_pool == TRUE) {
4449 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451 new_thr->th.th_active_in_pool = FALSE;
4453 __kmp_unlock_suspend_mx(new_thr);
4455 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457 KMP_ASSERT(!new_thr->th.th_team);
4458 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4460 /* setup the thread structure */
4461 __kmp_initialize_info(new_thr, team, new_tid,
4462 new_thr->th.th_info.ds.ds_gtid);
4463 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
// Only __kmp_nth grows on this path; within this function __kmp_all_nth is
// incremented only when a brand-new thread is created further below.
4465 TCW_4(__kmp_nth, __kmp_nth + 1);
// Reset the reused thread's task-state fields.
4467 new_thr->th.th_task_state = 0;
4468 new_thr->th.th_task_state_top = 0;
4469 new_thr->th.th_task_state_stack_sz = 4;
4471 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4472 // Make sure pool thread has transitioned to waiting on own thread struct
4473 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4474 // Thread activated in __kmp_allocate_team when increasing team size
4477 #ifdef KMP_ADJUST_BLOCKTIME
4478 /* Adjust blocktime back to zero if necessary */
4479 /* Middle initialization might not have occurred yet */
4480 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4481 if (__kmp_nth > __kmp_avail_proc) {
4482 __kmp_zero_bt = TRUE;
4485 #endif /* KMP_ADJUST_BLOCKTIME */
4487 #if KMP_DEBUG
4488 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4489 // KMP_BARRIER_PARENT_FLAG.
4490 int b;
4491 kmp_balign_t *balign = new_thr->th.th_bar;
4492 for (b = 0; b < bs_last_barrier; ++b)
4493 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4494 #endif
4496 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4497 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4499 KMP_MB();
4500 return new_thr;
4503 /* no, well fork a new one */
// The asserts below require __kmp_nth == __kmp_all_nth and that
// __kmp_all_nth is still below __kmp_threads_capacity.
4504 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4505 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4507 #if KMP_USE_MONITOR
4508 // If this is the first worker thread the RTL is creating, then also
4509 // launch the monitor thread. We try to do this as early as possible.
4510 if (!TCR_4(__kmp_init_monitor)) {
4511 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
// __kmp_init_monitor is tested again now that __kmp_monitor_lock is held.
4512 if (!TCR_4(__kmp_init_monitor)) {
4513 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4514 TCW_4(__kmp_init_monitor, 1);
4515 __kmp_create_monitor(&__kmp_monitor);
4516 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4517 #if KMP_OS_WINDOWS
4518 // AC: wait until monitor has started. This is a fix for CQ232808.
4519 // The reason is that if the library is loaded/unloaded in a loop with
4520 // small (parallel) work in between, then there is high probability that
4521 // monitor thread started after the library shutdown. At shutdown it is
4522 // too late to cope with the problem, because when the primary thread is
4523 // in DllMain (process detach) the monitor has no chances to start (it is
4524 // blocked), and primary thread has no means to inform the monitor that
4525 // the library has gone, because all the memory which the monitor can
4526 // access is going to be released/reset.
4527 while (TCR_4(__kmp_init_monitor) < 2) {
4528 KMP_YIELD(TRUE);
4530 KF_TRACE(10, ("after monitor thread has started\n"));
4531 #endif
4533 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4535 #endif
4537 KMP_MB();
// Pick the gtid at which the search for a free __kmp_threads slot starts.
// NOTE(review): the '?' arm of this conditional is not present in this
// listing (the numbering jumps from 4540 to 4542) -- confirm against the
// upstream file.
4540 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4542 : __kmp_hidden_helper_threads_num + 1;
// Advance new_gtid to the first NULL entry of __kmp_threads.
4544 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4545 ++new_gtid) {
4546 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
// While __kmp_init_hidden_helper_threads is set, the chosen gtid is asserted
// to be <= __kmp_hidden_helper_threads_num.
4549 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4550 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4554 /* allocate space for it. */
4555 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
// Store the new descriptor in the __kmp_threads slot found above.
4557 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4559 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4560 // suppress race conditions detection on synchronization flags in debug mode
4561 // this helps to analyze library internals eliminating false positives
4562 __itt_suppress_mark_range(
4563 __itt_suppress_range, __itt_suppress_threading_errors,
4564 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4565 __itt_suppress_mark_range(
4566 __itt_suppress_range, __itt_suppress_threading_errors,
4567 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4568 #if KMP_OS_WINDOWS
4569 __itt_suppress_mark_range(
4570 __itt_suppress_range, __itt_suppress_threading_errors,
4571 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4572 #else
4573 __itt_suppress_mark_range(__itt_suppress_range,
4574 __itt_suppress_threading_errors,
4575 &new_thr->th.th_suspend_init_count,
4576 sizeof(new_thr->th.th_suspend_init_count));
4577 #endif
4578 // TODO: check if we need to also suppress b_arrived flags
4579 __itt_suppress_mark_range(__itt_suppress_range,
4580 __itt_suppress_threading_errors,
4581 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4582 sizeof(new_thr->th.th_bar[0].bb.b_go));
4583 __itt_suppress_mark_range(__itt_suppress_range,
4584 __itt_suppress_threading_errors,
4585 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4586 sizeof(new_thr->th.th_bar[1].bb.b_go));
4587 __itt_suppress_mark_range(__itt_suppress_range,
4588 __itt_suppress_threading_errors,
4589 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4590 sizeof(new_thr->th.th_bar[2].bb.b_go));
4591 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4592 if (__kmp_storage_map) {
4593 __kmp_print_thread_storage_map(new_thr, new_gtid);
4596 // add the reserve serialized team, initialized from the team's primary thread
// The reserve team is requested with new_nproc == 1 and max_nproc == 1.
4598 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4599 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4600 new_thr->th.th_serial_team = serial_team =
4601 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4602 #if OMPT_SUPPORT
4603 ompt_data_none, // root parallel id
4604 #endif
4605 proc_bind_default, &r_icvs,
4606 0 USE_NESTED_HOT_ARG(NULL));
4608 KMP_ASSERT(serial_team);
4609 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4610 // execution (it is unused for now).
4611 serial_team->t.t_threads[0] = new_thr;
4612 KF_TRACE(10,
4613 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4614 new_thr));
4616 /* setup the thread structures */
4617 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4619 #if USE_FAST_MEMORY
4620 __kmp_initialize_fast_memory(new_thr);
4621 #endif /* USE_FAST_MEMORY */
4623 #if KMP_USE_BGET
4624 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4625 __kmp_initialize_bget(new_thr);
4626 #endif
4628 __kmp_init_random(new_thr); // Initialize random number generator
4630 /* Initialize these only once when thread is grabbed for a team allocation */
4631 KA_TRACE(20,
4632 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4633 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
// Put every barrier slot of the new thread into its initial, not-waiting
// state with no team attached.
4635 int b;
4636 kmp_balign_t *balign = new_thr->th.th_bar;
4637 for (b = 0; b < bs_last_barrier; ++b) {
4638 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4639 balign[b].bb.team = NULL;
4640 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4641 balign[b].bb.use_oncore_barrier = 0;
4644 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4645 new_thr->th.th_sleep_loc_type = flag_unset;
4647 new_thr->th.th_spin_here = FALSE;
4648 new_thr->th.th_next_waiting = 0;
4649 #if KMP_OS_UNIX
4650 new_thr->th.th_blocking = false;
4651 #endif
4653 #if KMP_AFFINITY_SUPPORTED
// All place fields start out undefined for a new thread.
4654 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4655 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4656 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4657 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4658 #endif
4659 new_thr->th.th_def_allocator = __kmp_def_allocator;
4660 new_thr->th.th_prev_level = 0;
4661 new_thr->th.th_prev_num_threads = 1;
// The new thread is marked active and not in the pool.
4663 TCW_4(new_thr->th.th_in_pool, FALSE);
4664 new_thr->th.th_active_in_pool = FALSE;
4665 TCW_4(new_thr->th.th_active, TRUE);
4667 /* adjust the global counters */
4668 __kmp_all_nth++;
4669 __kmp_nth++;
4671 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4672 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4673 if (__kmp_adjust_gtid_mode) {
4674 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4675 if (TCR_4(__kmp_gtid_mode) != 2) {
4676 TCW_4(__kmp_gtid_mode, 2);
4678 } else {
4679 if (TCR_4(__kmp_gtid_mode) != 1) {
4680 TCW_4(__kmp_gtid_mode, 1);
4685 #ifdef KMP_ADJUST_BLOCKTIME
4686 /* Adjust blocktime back to zero if necessary */
4687 /* Middle initialization might not have occurred yet */
4688 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4689 if (__kmp_nth > __kmp_avail_proc) {
4690 __kmp_zero_bt = TRUE;
4693 #endif /* KMP_ADJUST_BLOCKTIME */
4695 #if KMP_AFFINITY_SUPPORTED
4696 // Set the affinity and topology information for new thread
4697 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4698 #endif
4700 /* actually fork it and create the new worker thread */
4701 KF_TRACE(
4702 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4703 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4704 KF_TRACE(10,
4705 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4707 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4708 new_gtid));
4709 KMP_MB();
4710 return new_thr;
4713 /* Reinitialize team for reuse.
4714 The hot team code calls this case at every fork barrier, so EPCC barrier
4715 test are extremely sensitive to changes in it, esp. writes to the team
4716 struct, which cause a cache invalidation in all threads.
4717 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4718 static void __kmp_reinitialize_team(kmp_team_t *team,
4719 kmp_internal_control_t *new_icvs,
4720 ident_t *loc) {
4721 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4722 team->t.t_threads[0], team));
4723 KMP_DEBUG_ASSERT(team && new_icvs);
4724 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4725 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4727 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4728 // Copy ICVs to the primary thread's implicit taskdata
4729 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4730 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4732 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4733 team->t.t_threads[0], team));
4736 /* Initialize the team data structure.
4737 This assumes the t_threads and t_max_nproc are already set.
4738 Also, we don't touch the arguments */
4739 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4740 kmp_internal_control_t *new_icvs,
4741 ident_t *loc) {
4742 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4744 /* verify */
4745 KMP_DEBUG_ASSERT(team);
4746 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4747 KMP_DEBUG_ASSERT(team->t.t_threads);
4748 KMP_MB();
4750 team->t.t_master_tid = 0; /* not needed */
4751 /* team->t.t_master_bar; not needed */
4752 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4753 team->t.t_nproc = new_nproc;
4755 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4756 team->t.t_next_pool = NULL;
4757 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4758 * up hot team */
4760 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4761 team->t.t_invoke = NULL; /* not needed */
4763 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4764 team->t.t_sched.sched = new_icvs->sched.sched;
4766 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4767 team->t.t_fp_control_saved = FALSE; /* not needed */
4768 team->t.t_x87_fpu_control_word = 0; /* not needed */
4769 team->t.t_mxcsr = 0; /* not needed */
4770 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4772 team->t.t_construct = 0;
4774 team->t.t_ordered.dt.t_value = 0;
4775 team->t.t_master_active = FALSE;
4777 #ifdef KMP_DEBUG
4778 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4779 #endif
4780 #if KMP_OS_WINDOWS
4781 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4782 #endif
4784 team->t.t_control_stack_top = NULL;
4786 __kmp_reinitialize_team(team, new_icvs, loc);
4788 KMP_MB();
4789 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4792 #if KMP_AFFINITY_SUPPORTED
4793 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4794 int first, int last, int newp) {
4795 th->th.th_first_place = first;
4796 th->th.th_last_place = last;
4797 th->th.th_new_place = newp;
4798 if (newp != th->th.th_current_place) {
4799 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4800 team->t.t_display_affinity = 1;
4801 // Copy topology information associated with the new place
4802 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4803 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4807 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4808 // It calculates the worker + primary thread's partition based upon the parent
4809 // thread's partition, and binds each worker to a thread in their partition.
4810 // The primary thread's partition should already include its current binding.
//
// Parameters:
//   team               - team whose threads (team->t.t_threads[]) receive
//                        first/last/new place values through
//                        __kmp_set_thread_place(); the primary thread's
//                        [first_place, last_place] is also copied into the
//                        team struct.
//   update_master_only - consulted only by the proc_bind_spread case: when it
//                        equals 1 the loops there are limited to f == 0, so
//                        only t_threads[0] is updated.
//
// Throughout, "advancing" a place means: wrap to first_place after
// last_place, wrap to 0 after (num_masks - 1), otherwise increment.
4811 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4812 // Do not partition places for the hidden helper team
4813 if (KMP_HIDDEN_HELPER_TEAM(team))
4814 return;
4815 // Copy the primary thread's place partition to the team struct
4816 kmp_info_t *master_th = team->t.t_threads[0];
4817 KMP_DEBUG_ASSERT(master_th != NULL);
4818 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4819 int first_place = master_th->th.th_first_place;
4820 int last_place = master_th->th.th_last_place;
4821 int masters_place = master_th->th.th_current_place;
4822 int num_masks = __kmp_affinity.num_masks;
4823 team->t.t_first_place = first_place;
4824 team->t.t_last_place = last_place;
4826 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4827 "bound to place %d partition = [%d,%d]\n",
4828 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4829 team->t.t_id, masters_place, first_place, last_place));
4831 switch (proc_bind) {
4833 case proc_bind_default:
4834 // Serial teams might have the proc_bind policy set to proc_bind_default.
4835 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4836 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4837 break;
4839 case proc_bind_primary: {
// Every worker (f >= 1) gets the primary thread's place and the full
// [first_place, last_place] partition.
4840 int f;
4841 int n_th = team->t.t_nproc;
4842 for (f = 1; f < n_th; f++) {
4843 kmp_info_t *th = team->t.t_threads[f];
4844 KMP_DEBUG_ASSERT(th != NULL);
4845 __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4847 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4848 "partition = [%d,%d]\n",
4849 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4850 f, masters_place, first_place, last_place));
4852 } break;
4854 case proc_bind_close: {
4855 int f;
4856 int n_th = team->t.t_nproc;
4857 int n_places;
// Number of places in the partition; the else-branch covers a partition that
// wraps around the end of the place list (first_place > last_place).
4858 if (first_place <= last_place) {
4859 n_places = last_place - first_place + 1;
4860 } else {
4861 n_places = num_masks - first_place + last_place + 1;
// At most one thread per place: workers take the places following the
// primary thread's place, one each, and keep the full partition.
4863 if (n_th <= n_places) {
4864 int place = masters_place;
4865 for (f = 1; f < n_th; f++) {
4866 kmp_info_t *th = team->t.t_threads[f];
4867 KMP_DEBUG_ASSERT(th != NULL);
4869 if (place == last_place) {
4870 place = first_place;
4871 } else if (place == (num_masks - 1)) {
4872 place = 0;
4873 } else {
4874 place++;
4876 __kmp_set_thread_place(team, th, first_place, last_place, place);
4878 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4879 "partition = [%d,%d]\n",
4880 __kmp_gtid_from_thread(team->t.t_threads[f]),
4881 team->t.t_id, f, place, first_place, last_place));
4883 } else {
// More threads than places: S threads go to each place and the 'rem'
// left-over threads are handed out one per place. A place receives its extra
// thread when gap_ct has reached gap while rem is still non-zero.
4884 int S, rem, gap, s_count;
4885 S = n_th / n_places;
4886 s_count = 0;
4887 rem = n_th - (S * n_places);
4888 gap = rem > 0 ? n_places / rem : n_places;
4889 int place = masters_place;
4890 int gap_ct = gap;
4891 for (f = 0; f < n_th; f++) {
4892 kmp_info_t *th = team->t.t_threads[f];
4893 KMP_DEBUG_ASSERT(th != NULL);
4895 __kmp_set_thread_place(team, th, first_place, last_place, place);
4896 s_count++;
4898 if ((s_count == S) && rem && (gap_ct == gap)) {
4899 // do nothing, add an extra thread to place on next iteration
4900 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4901 // we added an extra thread to this place; move to next place
4902 if (place == last_place) {
4903 place = first_place;
4904 } else if (place == (num_masks - 1)) {
4905 place = 0;
4906 } else {
4907 place++;
4909 s_count = 0;
4910 gap_ct = 1;
4911 rem--;
4912 } else if (s_count == S) { // place full; don't add extra
4913 if (place == last_place) {
4914 place = first_place;
4915 } else if (place == (num_masks - 1)) {
4916 place = 0;
4917 } else {
4918 place++;
4920 gap_ct++;
4921 s_count = 0;
4924 KA_TRACE(100,
4925 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4926 "partition = [%d,%d]\n",
4927 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4928 th->th.th_new_place, first_place, last_place));
// After distributing all threads the walk is expected to be back at the
// primary thread's place.
4930 KMP_DEBUG_ASSERT(place == masters_place);
4932 } break;
4934 case proc_bind_spread: {
4935 int f;
4936 int n_th = team->t.t_nproc;
4937 int n_places;
4938 int thidx;
// Same partition-size computation as in the close case.
4939 if (first_place <= last_place) {
4940 n_places = last_place - first_place + 1;
4941 } else {
4942 n_places = num_masks - first_place + last_place + 1;
4944 if (n_th <= n_places) {
4945 int place = -1;
4947 if (n_places != num_masks) {
// The partition does not span all places: give each thread a sub-partition
// of S consecutive places (one more while rem is non-zero and
// gap_ct == gap) and bind the thread to the first place of it.
4948 int S = n_places / n_th;
4949 int s_count, rem, gap, gap_ct;
4951 place = masters_place;
4952 rem = n_places - n_th * S;
4953 gap = rem ? n_th / rem : 1;
4954 gap_ct = gap;
4955 thidx = n_th;
4956 if (update_master_only == 1)
4957 thidx = 1;
4958 for (f = 0; f < thidx; f++) {
4959 kmp_info_t *th = team->t.t_threads[f];
4960 KMP_DEBUG_ASSERT(th != NULL);
// fplace/nplace remember the start of this thread's sub-partition; 'place'
// is then advanced to the sub-partition's last place.
4962 int fplace = place, nplace = place;
4963 s_count = 1;
4964 while (s_count < S) {
4965 if (place == last_place) {
4966 place = first_place;
4967 } else if (place == (num_masks - 1)) {
4968 place = 0;
4969 } else {
4970 place++;
4972 s_count++;
4974 if (rem && (gap_ct == gap)) {
4975 if (place == last_place) {
4976 place = first_place;
4977 } else if (place == (num_masks - 1)) {
4978 place = 0;
4979 } else {
4980 place++;
4982 rem--;
4983 gap_ct = 0;
4985 __kmp_set_thread_place(team, th, fplace, place, nplace);
4986 gap_ct++;
// Step to the first place of the next thread's sub-partition.
4988 if (place == last_place) {
4989 place = first_place;
4990 } else if (place == (num_masks - 1)) {
4991 place = 0;
4992 } else {
4993 place++;
4996 KA_TRACE(100,
4997 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4998 "partition = [%d,%d], num_masks: %u\n",
4999 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5000 f, th->th.th_new_place, th->th.th_first_place,
5001 th->th.th_last_place, num_masks));
5003 } else {
5004 /* Having uniform space of available computation places I can create
5005 T partitions of round(P/T) size and put threads into the first
5006 place of each partition. */
5007 double current = static_cast<double>(masters_place);
5008 double spacing =
5009 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5010 int first, last;
5011 kmp_info_t *th;
// The loop is set up to run one iteration past the last thread; that extra
// pass (f == n_th) assigns nothing (see the 'f < n_th' guard) and only
// updates 'place' for the assertion after the loop.
5013 thidx = n_th + 1;
5014 if (update_master_only == 1)
5015 thidx = 1;
5016 for (f = 0; f < thidx; f++) {
5017 first = static_cast<int>(current);
5018 last = static_cast<int>(current + spacing) - 1;
5019 KMP_DEBUG_ASSERT(last >= first);
// Indices that ran past the end of the place list are folded back into
// range.
5020 if (first >= n_places) {
5021 if (masters_place) {
5022 first -= n_places;
5023 last -= n_places;
5024 if (first == (masters_place + 1)) {
5025 KMP_DEBUG_ASSERT(f == n_th);
5026 first--;
5028 if (last == masters_place) {
5029 KMP_DEBUG_ASSERT(f == (n_th - 1));
5030 last--;
5032 } else {
5033 KMP_DEBUG_ASSERT(f == n_th);
5034 first = 0;
5035 last = 0;
5038 if (last >= n_places) {
5039 last = (n_places - 1);
5041 place = first;
5042 current += spacing;
5043 if (f < n_th) {
5044 KMP_DEBUG_ASSERT(0 <= first);
5045 KMP_DEBUG_ASSERT(n_places > first);
5046 KMP_DEBUG_ASSERT(0 <= last);
5047 KMP_DEBUG_ASSERT(n_places > last);
5048 KMP_DEBUG_ASSERT(last_place >= first_place);
5049 th = team->t.t_threads[f];
5050 KMP_DEBUG_ASSERT(th);
5051 __kmp_set_thread_place(team, th, first, last, place);
5052 KA_TRACE(100,
5053 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5054 "partition = [%d,%d], spacing = %.4f\n",
5055 __kmp_gtid_from_thread(team->t.t_threads[f]),
5056 team->t.t_id, f, th->th.th_new_place,
5057 th->th.th_first_place, th->th.th_last_place, spacing));
5061 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5062 } else {
// More threads than places: the same S/rem/gap distribution as in the close
// case, except that each thread's partition is narrowed to its single place.
5063 int S, rem, gap, s_count;
5064 S = n_th / n_places;
5065 s_count = 0;
5066 rem = n_th - (S * n_places);
5067 gap = rem > 0 ? n_places / rem : n_places;
5068 int place = masters_place;
5069 int gap_ct = gap;
5070 thidx = n_th;
5071 if (update_master_only == 1)
5072 thidx = 1;
5073 for (f = 0; f < thidx; f++) {
5074 kmp_info_t *th = team->t.t_threads[f];
5075 KMP_DEBUG_ASSERT(th != NULL);
5077 __kmp_set_thread_place(team, th, place, place, place);
5078 s_count++;
5080 if ((s_count == S) && rem && (gap_ct == gap)) {
5081 // do nothing, add an extra thread to place on next iteration
5082 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5083 // we added an extra thread to this place; move on to next place
5084 if (place == last_place) {
5085 place = first_place;
5086 } else if (place == (num_masks - 1)) {
5087 place = 0;
5088 } else {
5089 place++;
5091 s_count = 0;
5092 gap_ct = 1;
5093 rem--;
5094 } else if (s_count == S) { // place is full; don't add extra thread
5095 if (place == last_place) {
5096 place = first_place;
5097 } else if (place == (num_masks - 1)) {
5098 place = 0;
5099 } else {
5100 place++;
5102 gap_ct++;
5103 s_count = 0;
5106 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5107 "partition = [%d,%d]\n",
5108 __kmp_gtid_from_thread(team->t.t_threads[f]),
5109 team->t.t_id, f, th->th.th_new_place,
5110 th->th.th_first_place, th->th.th_last_place));
5112 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5114 } break;
5116 default:
5117 break;
5120 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 #endif // KMP_AFFINITY_SUPPORTED
5125 /* allocate a new team data structure to use. take one off of the free pool if
5126 available */
5127 kmp_team_t *
5128 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5129 #if OMPT_SUPPORT
5130 ompt_data_t ompt_parallel_data,
5131 #endif
5132 kmp_proc_bind_t new_proc_bind,
5133 kmp_internal_control_t *new_icvs,
5134 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5135 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5136 int f;
5137 kmp_team_t *team;
5138 int use_hot_team = !root->r.r_active;
5139 int level = 0;
5140 int do_place_partition = 1;
5142 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5143 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5144 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5145 KMP_MB();
5147 #if KMP_NESTED_HOT_TEAMS
5148 kmp_hot_team_ptr_t *hot_teams;
5149 if (master) {
5150 team = master->th.th_team;
5151 level = team->t.t_active_level;
5152 if (master->th.th_teams_microtask) { // in teams construct?
5153 if (master->th.th_teams_size.nteams > 1 &&
5154 ( // #teams > 1
5155 team->t.t_pkfn ==
5156 (microtask_t)__kmp_teams_master || // inner fork of the teams
5157 master->th.th_teams_level <
5158 team->t.t_level)) { // or nested parallel inside the teams
5159 ++level; // not increment if #teams==1, or for outer fork of the teams;
5160 // increment otherwise
5162 // Do not perform the place partition if inner fork of the teams
5163 // Wait until nested parallel region encountered inside teams construct
5164 if ((master->th.th_teams_size.nteams == 1 &&
5165 master->th.th_teams_level >= team->t.t_level) ||
5166 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5167 do_place_partition = 0;
5169 hot_teams = master->th.th_hot_teams;
5170 if (level < __kmp_hot_teams_max_level && hot_teams &&
5171 hot_teams[level].hot_team) {
5172 // hot team has already been allocated for given level
5173 use_hot_team = 1;
5174 } else {
5175 use_hot_team = 0;
5177 } else {
5178 // check we won't access uninitialized hot_teams, just in case
5179 KMP_DEBUG_ASSERT(new_nproc == 1);
5181 #endif
5182 // Optimization to use a "hot" team
5183 if (use_hot_team && new_nproc > 1) {
5184 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5185 #if KMP_NESTED_HOT_TEAMS
5186 team = hot_teams[level].hot_team;
5187 #else
5188 team = root->r.r_hot_team;
5189 #endif
5190 #if KMP_DEBUG
5191 if (__kmp_tasking_mode != tskm_immediate_exec) {
5192 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5193 "task_team[1] = %p before reinit\n",
5194 team->t.t_task_team[0], team->t.t_task_team[1]));
5196 #endif
5198 if (team->t.t_nproc != new_nproc &&
5199 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5200 // Distributed barrier may need a resize
5201 int old_nthr = team->t.t_nproc;
5202 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205 // If not doing the place partition, then reset the team's proc bind
5206 // to indicate that partitioning of all threads still needs to take place
5207 if (do_place_partition == 0)
5208 team->t.t_proc_bind = proc_bind_default;
5209 // Has the number of threads changed?
5210 /* Let's assume the most common case is that the number of threads is
5211 unchanged, and put that case first. */
5212 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5213 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5214 // This case can mean that omp_set_num_threads() was called and the hot
5215 // team size was already reduced, so we check the special flag
5216 if (team->t.t_size_changed == -1) {
5217 team->t.t_size_changed = 1;
5218 } else {
5219 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5223 kmp_r_sched_t new_sched = new_icvs->sched;
5224 // set primary thread's schedule as new run-time schedule
5225 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5227 __kmp_reinitialize_team(team, new_icvs,
5228 root->r.r_uber_thread->th.th_ident);
5230 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5231 team->t.t_threads[0], team));
5232 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5234 #if KMP_AFFINITY_SUPPORTED
5235 if ((team->t.t_size_changed == 0) &&
5236 (team->t.t_proc_bind == new_proc_bind)) {
5237 if (new_proc_bind == proc_bind_spread) {
5238 if (do_place_partition) {
5239 // add flag to update only master for spread
5240 __kmp_partition_places(team, 1);
5243 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5244 "proc_bind = %d, partition = [%d,%d]\n",
5245 team->t.t_id, new_proc_bind, team->t.t_first_place,
5246 team->t.t_last_place));
5247 } else {
5248 if (do_place_partition) {
5249 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5250 __kmp_partition_places(team);
5253 #else
5254 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5255 #endif /* KMP_AFFINITY_SUPPORTED */
5256 } else if (team->t.t_nproc > new_nproc) {
5257 KA_TRACE(20,
5258 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5259 new_nproc));
5261 team->t.t_size_changed = 1;
5262 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5263 // Barrier size already reduced earlier in this function
5264 // Activate team threads via th_used_in_team
5265 __kmp_add_threads_to_team(team, new_nproc);
5267 #if KMP_NESTED_HOT_TEAMS
5268 if (__kmp_hot_teams_mode == 0) {
5269 // AC: saved number of threads should correspond to team's value in this
5270 // mode, can be bigger in mode 1, when hot team has threads in reserve
5271 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5272 hot_teams[level].hot_team_nth = new_nproc;
5273 #endif // KMP_NESTED_HOT_TEAMS
5274 /* release the extra threads we don't need any more */
5275 for (f = new_nproc; f < team->t.t_nproc; f++) {
5276 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5277 if (__kmp_tasking_mode != tskm_immediate_exec) {
5278 // When decreasing team size, threads no longer in the team should
5279 // unref task team.
5280 team->t.t_threads[f]->th.th_task_team = NULL;
5282 __kmp_free_thread(team->t.t_threads[f]);
5283 team->t.t_threads[f] = NULL;
5285 #if KMP_NESTED_HOT_TEAMS
5286 } // (__kmp_hot_teams_mode == 0)
5287 else {
5288 // When keeping extra threads in team, switch threads to wait on own
5289 // b_go flag
5290 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5291 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5293 for (int b = 0; b < bs_last_barrier; ++b) {
5294 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5295 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5297 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5301 #endif // KMP_NESTED_HOT_TEAMS
5302 team->t.t_nproc = new_nproc;
5303 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5304 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5305 __kmp_reinitialize_team(team, new_icvs,
5306 root->r.r_uber_thread->th.th_ident);
5308 // Update remaining threads
5309 for (f = 0; f < new_nproc; ++f) {
5310 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313 // restore the current task state of the primary thread: should be the
5314 // implicit task
5315 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5316 team->t.t_threads[0], team));
5318 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5320 #ifdef KMP_DEBUG
5321 for (f = 0; f < team->t.t_nproc; f++) {
5322 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5323 team->t.t_threads[f]->th.th_team_nproc ==
5324 team->t.t_nproc);
5326 #endif
5328 if (do_place_partition) {
5329 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5330 #if KMP_AFFINITY_SUPPORTED
5331 __kmp_partition_places(team);
5332 #endif
5334 } else { // team->t.t_nproc < new_nproc
5336 KA_TRACE(20,
5337 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5338 new_nproc));
5339 int old_nproc = team->t.t_nproc; // save old value and use to update only
5340 team->t.t_size_changed = 1;
5342 #if KMP_NESTED_HOT_TEAMS
5343 int avail_threads = hot_teams[level].hot_team_nth;
5344 if (new_nproc < avail_threads)
5345 avail_threads = new_nproc;
5346 kmp_info_t **other_threads = team->t.t_threads;
5347 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5348 // Adjust barrier data of reserved threads (if any) of the team
5349 // Other data will be set in __kmp_initialize_info() below.
5350 int b;
5351 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5352 for (b = 0; b < bs_last_barrier; ++b) {
5353 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5354 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5355 #if USE_DEBUGGER
5356 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5357 #endif
5360 if (hot_teams[level].hot_team_nth >= new_nproc) {
5361 // we have all needed threads in reserve, no need to allocate any
5362 // this only possible in mode 1, cannot have reserved threads in mode 0
5363 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5364 team->t.t_nproc = new_nproc; // just get reserved threads involved
5365 } else {
5366 // We may have some threads in reserve, but not enough;
5367 // get reserved threads involved if any.
5368 team->t.t_nproc = hot_teams[level].hot_team_nth;
5369 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5370 #endif // KMP_NESTED_HOT_TEAMS
5371 if (team->t.t_max_nproc < new_nproc) {
5372 /* reallocate larger arrays */
5373 __kmp_reallocate_team_arrays(team, new_nproc);
5374 __kmp_reinitialize_team(team, new_icvs, NULL);
5377 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5378 /* Temporarily set full mask for primary thread before creation of
5379 workers. The reason is that workers inherit the affinity from the
5380 primary thread, so if a lot of workers are created on the single
5381 core quickly, they don't get a chance to set their own affinity for
5382 a long time. */
5383 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5384 #endif
5386 /* allocate new threads for the hot team */
5387 for (f = team->t.t_nproc; f < new_nproc; f++) {
5388 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5389 KMP_DEBUG_ASSERT(new_worker);
5390 team->t.t_threads[f] = new_worker;
5392 KA_TRACE(20,
5393 ("__kmp_allocate_team: team %d init T#%d arrived: "
5394 "join=%llu, plain=%llu\n",
5395 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5396 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5397 team->t.t_bar[bs_plain_barrier].b_arrived));
5399 { // Initialize barrier data for new threads.
5400 int b;
5401 kmp_balign_t *balign = new_worker->th.th_bar;
5402 for (b = 0; b < bs_last_barrier; ++b) {
5403 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5404 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5405 KMP_BARRIER_PARENT_FLAG);
5406 #if USE_DEBUGGER
5407 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5408 #endif
5413 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5414 /* Restore initial primary thread's affinity mask */
5415 new_temp_affinity.restore();
5416 #endif
5417 #if KMP_NESTED_HOT_TEAMS
5418 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5419 #endif // KMP_NESTED_HOT_TEAMS
5420 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5421 // Barrier size already increased earlier in this function
5422 // Activate team threads via th_used_in_team
5423 __kmp_add_threads_to_team(team, new_nproc);
5425 /* make sure everyone is synchronized */
5426 // new threads below
5427 __kmp_initialize_team(team, new_nproc, new_icvs,
5428 root->r.r_uber_thread->th.th_ident);
5430 /* reinitialize the threads */
5431 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5432 for (f = 0; f < team->t.t_nproc; ++f)
5433 __kmp_initialize_info(team->t.t_threads[f], team, f,
5434 __kmp_gtid_from_tid(f, team));
5436 // set th_task_state for new threads in hot team with older thread's state
5437 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5438 for (f = old_nproc; f < team->t.t_nproc; ++f)
5439 team->t.t_threads[f]->th.th_task_state = old_state;
5441 #ifdef KMP_DEBUG
5442 for (f = 0; f < team->t.t_nproc; ++f) {
5443 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5444 team->t.t_threads[f]->th.th_team_nproc ==
5445 team->t.t_nproc);
5447 #endif
5449 if (do_place_partition) {
5450 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5451 #if KMP_AFFINITY_SUPPORTED
5452 __kmp_partition_places(team);
5453 #endif
5455 } // Check changes in number of threads
5457 kmp_info_t *master = team->t.t_threads[0];
5458 if (master->th.th_teams_microtask) {
5459 for (f = 1; f < new_nproc; ++f) {
5460 // propagate teams construct specific info to workers
5461 kmp_info_t *thr = team->t.t_threads[f];
5462 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5463 thr->th.th_teams_level = master->th.th_teams_level;
5464 thr->th.th_teams_size = master->th.th_teams_size;
5467 #if KMP_NESTED_HOT_TEAMS
5468 if (level) {
5469 // Sync barrier state for nested hot teams, not needed for outermost hot
5470 // team.
5471 for (f = 1; f < new_nproc; ++f) {
5472 kmp_info_t *thr = team->t.t_threads[f];
5473 int b;
5474 kmp_balign_t *balign = thr->th.th_bar;
5475 for (b = 0; b < bs_last_barrier; ++b) {
5476 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5477 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5478 #if USE_DEBUGGER
5479 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5480 #endif
5484 #endif // KMP_NESTED_HOT_TEAMS
5486 /* reallocate space for arguments if necessary */
5487 __kmp_alloc_argv_entries(argc, team, TRUE);
5488 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5489 // The hot team re-uses the previous task team,
5490 // if untouched during the previous release->gather phase.
5492 KF_TRACE(10, (" hot_team = %p\n", team));
5494 #if KMP_DEBUG
5495 if (__kmp_tasking_mode != tskm_immediate_exec) {
5496 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5497 "task_team[1] = %p after reinit\n",
5498 team->t.t_task_team[0], team->t.t_task_team[1]));
5500 #endif
5502 #if OMPT_SUPPORT
5503 __ompt_team_assign_id(team, ompt_parallel_data);
5504 #endif
5506 KMP_MB();
5508 return team;
5511 /* next, let's try to take one from the team pool */
5512 KMP_MB();
5513 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5514 /* TODO: consider resizing undersized teams instead of reaping them, now
5515 that we have a resizing mechanism */
5516 if (team->t.t_max_nproc >= max_nproc) {
5517 /* take this team from the team pool */
5518 __kmp_team_pool = team->t.t_next_pool;
5520 if (max_nproc > 1 &&
5521 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5522 if (!team->t.b) { // Allocate barrier structure
5523 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5527 /* setup the team for fresh use */
5528 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5530 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5531 "task_team[1] %p to NULL\n",
5532 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5533 team->t.t_task_team[0] = NULL;
5534 team->t.t_task_team[1] = NULL;
5536 /* reallocate space for arguments if necessary */
5537 __kmp_alloc_argv_entries(argc, team, TRUE);
5538 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5540 KA_TRACE(
5541 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5542 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5543 { // Initialize barrier data.
5544 int b;
5545 for (b = 0; b < bs_last_barrier; ++b) {
5546 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5547 #if USE_DEBUGGER
5548 team->t.t_bar[b].b_master_arrived = 0;
5549 team->t.t_bar[b].b_team_arrived = 0;
5550 #endif
5554 team->t.t_proc_bind = new_proc_bind;
5556 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5557 team->t.t_id));
5559 #if OMPT_SUPPORT
5560 __ompt_team_assign_id(team, ompt_parallel_data);
5561 #endif
5563 KMP_MB();
5565 return team;
5568 /* reap team if it is too small, then loop back and check the next one */
5569 // not sure if this is wise, but, will be redone during the hot-teams
5570 // rewrite.
5571 /* TODO: Use technique to find the right size hot-team, don't reap them */
5572 team = __kmp_reap_team(team);
5573 __kmp_team_pool = team;
5576 /* nothing available in the pool, no matter, make a new team! */
5577 KMP_MB();
5578 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5580 /* and set it up */
5581 team->t.t_max_nproc = max_nproc;
5582 if (max_nproc > 1 &&
5583 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5584 // Allocate barrier structure
5585 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5588 /* NOTE well, for some reason allocating one big buffer and dividing it up
5589 seems to really hurt performance a lot on the P4, so, let's not use this */
5590 __kmp_allocate_team_arrays(team, max_nproc);
5592 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5593 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5595 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5596 "%p to NULL\n",
5597 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5598 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5599 // memory, no need to duplicate
5600 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5601 // memory, no need to duplicate
5603 if (__kmp_storage_map) {
5604 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5607 /* allocate space for arguments */
5608 __kmp_alloc_argv_entries(argc, team, FALSE);
5609 team->t.t_argc = argc;
5611 KA_TRACE(20,
5612 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5613 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5614 { // Initialize barrier data.
5615 int b;
5616 for (b = 0; b < bs_last_barrier; ++b) {
5617 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5618 #if USE_DEBUGGER
5619 team->t.t_bar[b].b_master_arrived = 0;
5620 team->t.t_bar[b].b_team_arrived = 0;
5621 #endif
5625 team->t.t_proc_bind = new_proc_bind;
5627 #if OMPT_SUPPORT
5628 __ompt_team_assign_id(team, ompt_parallel_data);
5629 team->t.ompt_serialized_team_info = NULL;
5630 #endif
5632 KMP_MB();
5634 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5635 team->t.t_id));
5637 return team;
5640 /* TODO implement hot-teams at all levels */
5641 /* TODO implement lazy thread release on demand (disband request) */
5643 /* free the team. return it to the team pool. release all the threads
5644 * associated with it */
5645 void __kmp_free_team(kmp_root_t *root,
5646 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5647 int f;
5648 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5649 team->t.t_id));
5651 /* verify state */
5652 KMP_DEBUG_ASSERT(root);
5653 KMP_DEBUG_ASSERT(team);
5654 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5655 KMP_DEBUG_ASSERT(team->t.t_threads);
5657 int use_hot_team = team == root->r.r_hot_team;
5658 #if KMP_NESTED_HOT_TEAMS
5659 int level;
5660 if (master) {
5661 level = team->t.t_active_level - 1;
5662 if (master->th.th_teams_microtask) { // in teams construct?
5663 if (master->th.th_teams_size.nteams > 1) {
5664 ++level; // level was not increased in teams construct for
5665 // team_of_masters
5667 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5668 master->th.th_teams_level == team->t.t_level) {
5669 ++level; // level was not increased in teams construct for
5670 // team_of_workers before the parallel
5671 } // team->t.t_level will be increased inside parallel
5673 #if KMP_DEBUG
5674 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5675 #endif
5676 if (level < __kmp_hot_teams_max_level) {
5677 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5678 use_hot_team = 1;
5681 #endif // KMP_NESTED_HOT_TEAMS
5683 /* team is done working */
5684 TCW_SYNC_PTR(team->t.t_pkfn,
5685 NULL); // Important for Debugging Support Library.
5686 #if KMP_OS_WINDOWS
5687 team->t.t_copyin_counter = 0; // init counter for possible reuse
5688 #endif
5689 // Do not reset pointer to parent team to NULL for hot teams.
5691 /* if we are non-hot team, release our threads */
5692 if (!use_hot_team) {
5693 if (__kmp_tasking_mode != tskm_immediate_exec) {
5694 // Wait for threads to reach reapable state
5695 for (f = 1; f < team->t.t_nproc; ++f) {
5696 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5697 kmp_info_t *th = team->t.t_threads[f];
5698 volatile kmp_uint32 *state = &th->th.th_reap_state;
5699 while (*state != KMP_SAFE_TO_REAP) {
5700 #if KMP_OS_WINDOWS
5701 // On Windows a thread can be killed at any time, check this
5702 DWORD ecode;
5703 if (!__kmp_is_thread_alive(th, &ecode)) {
5704 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5705 break;
5707 #endif
5708 // first check if thread is sleeping
5709 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5710 if (fl.is_sleeping())
5711 fl.resume(__kmp_gtid_from_thread(th));
5712 KMP_CPU_PAUSE();
5716 // Delete task teams
5717 int tt_idx;
5718 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5719 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5720 if (task_team != NULL) {
5721 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5722 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5723 team->t.t_threads[f]->th.th_task_team = NULL;
5725 KA_TRACE(
5727 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5728 __kmp_get_gtid(), task_team, team->t.t_id));
5729 #if KMP_NESTED_HOT_TEAMS
5730 __kmp_free_task_team(master, task_team);
5731 #endif
5732 team->t.t_task_team[tt_idx] = NULL;
5737 // Reset pointer to parent team only for non-hot teams.
5738 team->t.t_parent = NULL;
5739 team->t.t_level = 0;
5740 team->t.t_active_level = 0;
5742 /* free the worker threads */
5743 for (f = 1; f < team->t.t_nproc; ++f) {
5744 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5746 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5747 1, 2);
5749 __kmp_free_thread(team->t.t_threads[f]);
5752 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5753 if (team->t.b) {
5754 // wake up thread at old location
5755 team->t.b->go_release();
5756 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5757 for (f = 1; f < team->t.t_nproc; ++f) {
5758 if (team->t.b->sleep[f].sleep) {
5759 __kmp_atomic_resume_64(
5760 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5761 (kmp_atomic_flag_64<> *)NULL);
5765 // Wait for threads to be removed from team
5766 for (int f = 1; f < team->t.t_nproc; ++f) {
5767 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5768 KMP_CPU_PAUSE();
5773 for (f = 1; f < team->t.t_nproc; ++f) {
5774 team->t.t_threads[f] = NULL;
5777 if (team->t.t_max_nproc > 1 &&
5778 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5779 distributedBarrier::deallocate(team->t.b);
5780 team->t.b = NULL;
5782 /* put the team back in the team pool */
5783 /* TODO limit size of team pool, call reap_team if pool too large */
5784 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5785 __kmp_team_pool = (volatile kmp_team_t *)team;
5786 } else { // Check if team was created for primary threads in teams construct
5787 // See if first worker is a CG root
5788 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5789 team->t.t_threads[1]->th.th_cg_roots);
5790 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5791 // Clean up the CG root nodes on workers so that this team can be re-used
5792 for (f = 1; f < team->t.t_nproc; ++f) {
5793 kmp_info_t *thr = team->t.t_threads[f];
5794 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5795 thr->th.th_cg_roots->cg_root == thr);
5796 // Pop current CG root off list
5797 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5798 thr->th.th_cg_roots = tmp->up;
5799 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5800 " up to node %p. cg_nthreads was %d\n",
5801 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5802 int i = tmp->cg_nthreads--;
5803 if (i == 1) {
5804 __kmp_free(tmp); // free CG if we are the last thread in it
5806 // Restore current task's thread_limit from CG root
5807 if (thr->th.th_cg_roots)
5808 thr->th.th_current_task->td_icvs.thread_limit =
5809 thr->th.th_cg_roots->cg_thread_limit;
5814 KMP_MB();
5817 /* reap the team. destroy it, reclaim all its resources and free its memory */
5818 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5819 kmp_team_t *next_pool = team->t.t_next_pool;
5821 KMP_DEBUG_ASSERT(team);
5822 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5823 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5824 KMP_DEBUG_ASSERT(team->t.t_threads);
5825 KMP_DEBUG_ASSERT(team->t.t_argv);
5827 /* TODO clean the threads that are a part of this? */
5829 /* free stuff */
5830 __kmp_free_team_arrays(team);
5831 if (team->t.t_argv != &team->t.t_inline_argv[0])
5832 __kmp_free((void *)team->t.t_argv);
5833 __kmp_free(team);
5835 KMP_MB();
5836 return next_pool;
5839 // Free the thread. Don't reap it, just place it on the pool of available
5840 // threads.
5842 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5843 // binding for the affinity mechanism to be useful.
5845 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5846 // However, we want to avoid a potential performance problem by always
5847 // scanning through the list to find the correct point at which to insert
5848 // the thread (potential N**2 behavior). To do this we keep track of the
5849 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5850 // With single-level parallelism, threads will always be added to the tail
5851 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5852 // parallelism, all bets are off and we may need to scan through the entire
5853 // free list.
5855 // This change also has a potentially large performance benefit, for some
5856 // applications. Previously, as threads were freed from the hot team, they
5857 // would be placed back on the free list in inverse order. If the hot team
5858 // grew back to its original size, then the freed thread would be placed
5859 // back on the hot team in reverse order. This could cause bad cache
5860 // locality problems on programs where the size of the hot team regularly
5861 // grew and shrunk.
5863 // Now, for single-level parallelism, the OMP tid is always == gtid.
5864 void __kmp_free_thread(kmp_info_t *this_th) {
5865 int gtid;
5866 kmp_info_t **scan;
5868 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5869 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5871 KMP_DEBUG_ASSERT(this_th);
5873 // When moving thread to pool, switch thread to wait on own b_go flag, and
5874 // uninitialized (NULL team).
5875 int b;
5876 kmp_balign_t *balign = this_th->th.th_bar;
5877 for (b = 0; b < bs_last_barrier; ++b) {
5878 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5879 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5880 balign[b].bb.team = NULL;
5881 balign[b].bb.leaf_kids = 0;
5883 this_th->th.th_task_state = 0;
5884 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5886 /* put thread back on the free pool */
5887 TCW_PTR(this_th->th.th_team, NULL);
5888 TCW_PTR(this_th->th.th_root, NULL);
5889 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5891 while (this_th->th.th_cg_roots) {
5892 this_th->th.th_cg_roots->cg_nthreads--;
5893 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5894 " %p of thread %p to %d\n",
5895 this_th, this_th->th.th_cg_roots,
5896 this_th->th.th_cg_roots->cg_root,
5897 this_th->th.th_cg_roots->cg_nthreads));
5898 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5899 if (tmp->cg_root == this_th) { // Thread is a cg_root
5900 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5901 KA_TRACE(
5902 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5903 this_th->th.th_cg_roots = tmp->up;
5904 __kmp_free(tmp);
5905 } else { // Worker thread
5906 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5907 __kmp_free(tmp);
5909 this_th->th.th_cg_roots = NULL;
5910 break;
5914 /* If the implicit task assigned to this thread can be used by other threads
5915 * -> multiple threads can share the data and try to free the task at
5916 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5917 * with higher probability when hot team is disabled but can occurs even when
5918 * the hot team is enabled */
5919 __kmp_free_implicit_task(this_th);
5920 this_th->th.th_current_task = NULL;
5922 // If the __kmp_thread_pool_insert_pt is already past the new insert
5923 // point, then we need to re-scan the entire list.
5924 gtid = this_th->th.th_info.ds.ds_gtid;
5925 if (__kmp_thread_pool_insert_pt != NULL) {
5926 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5927 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5928 __kmp_thread_pool_insert_pt = NULL;
5932 // Scan down the list to find the place to insert the thread.
5933 // scan is the address of a link in the list, possibly the address of
5934 // __kmp_thread_pool itself.
5936 // In the absence of nested parallelism, the for loop will have 0 iterations.
5937 if (__kmp_thread_pool_insert_pt != NULL) {
5938 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5939 } else {
5940 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5942 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5943 scan = &((*scan)->th.th_next_pool))
5946 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5947 // to its address.
5948 TCW_PTR(this_th->th.th_next_pool, *scan);
5949 __kmp_thread_pool_insert_pt = *scan = this_th;
5950 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5951 (this_th->th.th_info.ds.ds_gtid <
5952 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5953 TCW_4(this_th->th.th_in_pool, TRUE);
5954 __kmp_suspend_initialize_thread(this_th);
5955 __kmp_lock_suspend_mx(this_th);
5956 if (this_th->th.th_active == TRUE) {
5957 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5958 this_th->th.th_active_in_pool = TRUE;
5960 #if KMP_DEBUG
5961 else {
5962 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5964 #endif
5965 __kmp_unlock_suspend_mx(this_th);
5967 TCW_4(__kmp_nth, __kmp_nth - 1);
5969 #ifdef KMP_ADJUST_BLOCKTIME
5970 /* Adjust blocktime back to user setting or default if necessary */
5971 /* Middle initialization might never have occurred */
5972 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5973 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5974 if (__kmp_nth <= __kmp_avail_proc) {
5975 __kmp_zero_bt = FALSE;
5978 #endif /* KMP_ADJUST_BLOCKTIME */
5980 KMP_MB();
5983 /* ------------------------------------------------------------------------ */
5985 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5986 #if OMP_PROFILING_SUPPORT
5987 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5988 // TODO: add a configuration option for time granularity
5989 if (ProfileTraceFile)
5990 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5991 #endif
5993 int gtid = this_thr->th.th_info.ds.ds_gtid;
5994 /* void *stack_data;*/
5995 kmp_team_t **volatile pteam;
5997 KMP_MB();
5998 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6000 if (__kmp_env_consistency_check) {
6001 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6004 #if OMPD_SUPPORT
6005 if (ompd_state & OMPD_ENABLE_BP)
6006 ompd_bp_thread_begin();
6007 #endif
6009 #if OMPT_SUPPORT
6010 ompt_data_t *thread_data = nullptr;
6011 if (ompt_enabled.enabled) {
6012 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6013 *thread_data = ompt_data_none;
6015 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6016 this_thr->th.ompt_thread_info.wait_id = 0;
6017 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6018 this_thr->th.ompt_thread_info.parallel_flags = 0;
6019 if (ompt_enabled.ompt_callback_thread_begin) {
6020 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6021 ompt_thread_worker, thread_data);
6023 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6025 #endif
6027 /* This is the place where threads wait for work */
6028 while (!TCR_4(__kmp_global.g.g_done)) {
6029 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6030 KMP_MB();
6032 /* wait for work to do */
6033 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6035 /* No tid yet since not part of a team */
6036 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6038 #if OMPT_SUPPORT
6039 if (ompt_enabled.enabled) {
6040 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6042 #endif
6044 pteam = &this_thr->th.th_team;
6046 /* have we been allocated? */
6047 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6048 /* we were just woken up, so run our new task */
6049 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6050 int rc;
6051 KA_TRACE(20,
6052 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6053 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6054 (*pteam)->t.t_pkfn));
6056 updateHWFPControl(*pteam);
6058 #if OMPT_SUPPORT
6059 if (ompt_enabled.enabled) {
6060 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6062 #endif
6064 rc = (*pteam)->t.t_invoke(gtid);
6065 KMP_ASSERT(rc);
6067 KMP_MB();
6068 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6069 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6070 (*pteam)->t.t_pkfn));
6072 #if OMPT_SUPPORT
6073 if (ompt_enabled.enabled) {
6074 /* no frame set while outside task */
6075 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6077 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6079 #endif
6080 /* join barrier after parallel region */
6081 __kmp_join_barrier(gtid);
6085 #if OMPD_SUPPORT
6086 if (ompd_state & OMPD_ENABLE_BP)
6087 ompd_bp_thread_end();
6088 #endif
6090 #if OMPT_SUPPORT
6091 if (ompt_enabled.ompt_callback_thread_end) {
6092 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6094 #endif
6096 this_thr->th.th_task_team = NULL;
6097 /* run the destructors for the threadprivate data for this thread */
6098 __kmp_common_destroy_gtid(gtid);
6100 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6101 KMP_MB();
6103 #if OMP_PROFILING_SUPPORT
6104 llvm::timeTraceProfilerFinishThread();
6105 #endif
6106 return this_thr;
6109 /* ------------------------------------------------------------------------ */
6111 void __kmp_internal_end_dest(void *specific_gtid) {
6112 // Make sure no significant bits are lost
6113 int gtid;
6114 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6116 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6117 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6118 * this is because 0 is reserved for the nothing-stored case */
6120 __kmp_internal_end_thread(gtid);
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB

// Registered through the compiler's destructor attribute; simply forwards to
// the atexit handler.
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}

#endif
6131 /* [Windows] josh: when the atexit handler is called, there may still be more
6132 than one thread alive */
6133 void __kmp_internal_end_atexit(void) {
6134 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6135 /* [Windows]
6136 josh: ideally, we want to completely shutdown the library in this atexit
6137 handler, but stat code that depends on thread specific data for gtid fails
6138 because that data becomes unavailable at some point during the shutdown, so
6139 we call __kmp_internal_end_thread instead. We should eventually remove the
6140 dependency on __kmp_get_specific_gtid in the stat code and use
6141 __kmp_internal_end_library to cleanly shutdown the library.
6143 // TODO: Can some of this comment about GVS be removed?
6144 I suspect that the offending stat code is executed when the calling thread
6145 tries to clean up a dead root thread's data structures, resulting in GVS
6146 code trying to close the GVS structures for that thread, but since the stat
6147 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6148 the calling thread is cleaning up itself instead of another thread, it get
6149 confused. This happens because allowing a thread to unregister and cleanup
6150 another thread is a recent modification for addressing an issue.
6151 Based on the current design (20050722), a thread may end up
6152 trying to unregister another thread only if thread death does not trigger
6153 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6154 thread specific data destructor function to detect thread death. For
6155 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6156 is nothing. Thus, the workaround is applicable only for Windows static
6157 stat library. */
6158 __kmp_internal_end_library(-1);
6159 #if KMP_OS_WINDOWS
6160 __kmp_close_console();
6161 #endif
6164 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6165 // It is assumed __kmp_forkjoin_lock is acquired.
6167 int gtid;
6169 KMP_DEBUG_ASSERT(thread != NULL);
6171 gtid = thread->th.th_info.ds.ds_gtid;
6173 if (!is_root) {
6174 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6175 /* Assume the threads are at the fork barrier here */
6176 KA_TRACE(
6177 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6178 gtid));
6179 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6180 while (
6181 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6182 KMP_CPU_PAUSE();
6183 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6184 } else {
6185 /* Need release fence here to prevent seg faults for tree forkjoin
6186 barrier (GEH) */
6187 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6188 thread);
6189 __kmp_release_64(&flag);
6193 // Terminate OS thread.
6194 __kmp_reap_worker(thread);
6196 // The thread was killed asynchronously. If it was actively
6197 // spinning in the thread pool, decrement the global count.
6199 // There is a small timing hole here - if the worker thread was just waking
6200 // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6201 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6202 // the global counter might not get updated.
6204 // Currently, this can only happen as the library is unloaded,
6205 // so there are no harmful side effects.
6206 if (thread->th.th_active_in_pool) {
6207 thread->th.th_active_in_pool = FALSE;
6208 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6209 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6213 __kmp_free_implicit_task(thread);
6215 // Free the fast memory for tasking
6216 #if USE_FAST_MEMORY
6217 __kmp_free_fast_memory(thread);
6218 #endif /* USE_FAST_MEMORY */
6220 __kmp_suspend_uninitialize_thread(thread);
6222 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6223 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6225 --__kmp_all_nth;
6226 // __kmp_nth was decremented when thread is added to the pool.
6228 #ifdef KMP_ADJUST_BLOCKTIME
6229 /* Adjust blocktime back to user setting or default if necessary */
6230 /* Middle initialization might never have occurred */
6231 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6232 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6233 if (__kmp_nth <= __kmp_avail_proc) {
6234 __kmp_zero_bt = FALSE;
6237 #endif /* KMP_ADJUST_BLOCKTIME */
6239 /* free the memory being used */
6240 if (__kmp_env_consistency_check) {
6241 if (thread->th.th_cons) {
6242 __kmp_free_cons_stack(thread->th.th_cons);
6243 thread->th.th_cons = NULL;
6247 if (thread->th.th_pri_common != NULL) {
6248 __kmp_free(thread->th.th_pri_common);
6249 thread->th.th_pri_common = NULL;
6252 if (thread->th.th_task_state_memo_stack != NULL) {
6253 __kmp_free(thread->th.th_task_state_memo_stack);
6254 thread->th.th_task_state_memo_stack = NULL;
6257 #if KMP_USE_BGET
6258 if (thread->th.th_local.bget_data != NULL) {
6259 __kmp_finalize_bget(thread);
6261 #endif
6263 #if KMP_AFFINITY_SUPPORTED
6264 if (thread->th.th_affin_mask != NULL) {
6265 KMP_CPU_FREE(thread->th.th_affin_mask);
6266 thread->th.th_affin_mask = NULL;
6268 #endif /* KMP_AFFINITY_SUPPORTED */
6270 #if KMP_USE_HIER_SCHED
6271 if (thread->th.th_hier_bar_data != NULL) {
6272 __kmp_free(thread->th.th_hier_bar_data);
6273 thread->th.th_hier_bar_data = NULL;
6275 #endif
6277 __kmp_reap_team(thread->th.th_serial_team);
6278 thread->th.th_serial_team = NULL;
6279 __kmp_free(thread);
6281 KMP_MB();
6283 } // __kmp_reap_thread
6285 static void __kmp_itthash_clean(kmp_info_t *th) {
6286 #if USE_ITT_NOTIFY
6287 if (__kmp_itt_region_domains.count > 0) {
6288 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6289 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6290 while (bucket) {
6291 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6292 __kmp_thread_free(th, bucket);
6293 bucket = next;
6297 if (__kmp_itt_barrier_domains.count > 0) {
6298 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6299 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6300 while (bucket) {
6301 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6302 __kmp_thread_free(th, bucket);
6303 bucket = next;
6307 #endif
// Core shutdown routine: unregisters the library, raises g_done, and then
// either (a) only reaps the monitor thread when some root is still active, or
// (b) reaps pooled worker threads, pooled teams and task teams when no root is
// active. In both cases it finishes by clearing __kmp_init_gtid and calling
// __kmp_cleanup() (plus ompt_fini() when OMPT_SUPPORT is enabled).
// Takes no arguments and returns nothing. Both callers visible in this file,
// __kmp_internal_end_library() and __kmp_internal_end_thread(), call it while
// holding __kmp_initz_lock and __kmp_forkjoin_lock.
6310 static void __kmp_internal_end(void) {
6311 int i;
6313 /* First, unregister the library */
6314 __kmp_unregister_library();
6316 #if KMP_OS_WINDOWS
6317 /* In Win static library, we can't tell when a root actually dies, so we
6318 reclaim the data structures for any root threads that have died but not
6319 unregistered themselves, in order to shut down cleanly.
6320 In Win dynamic library we also can't tell when a thread dies. */
6321 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6322 // dead roots
6323 #endif
// Look for a root that is still active. The loop leaves i at the index of the
// first active root, or at __kmp_threads_capacity when there is none.
6325 for (i = 0; i < __kmp_threads_capacity; i++)
6326 if (__kmp_root[i])
6327 if (__kmp_root[i]->r.r_active)
6328 break;
6329 KMP_MB(); /* Flush all pending memory write invalidates. */
// Publish the "runtime is done" flag before any reaping starts.
6330 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
// Branch (a): an active root was found, so nothing but the monitor is reaped.
6332 if (i < __kmp_threads_capacity) {
6333 #if KMP_USE_MONITOR
6334 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6335 KMP_MB(); /* Flush all pending memory write invalidates. */
6337 // Need to check that monitor was initialized before reaping it. If we are
6338 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6339 // __kmp_monitor will appear to contain valid data, but it is only valid in
6340 // the parent process, not the child.
6341 // New behavior (201008): instead of keying off of the flag
6342 // __kmp_init_parallel, the monitor thread creation is keyed off
6343 // of the new flag __kmp_init_monitor.
6344 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6345 if (TCR_4(__kmp_init_monitor)) {
6346 __kmp_reap_monitor(&__kmp_monitor);
6347 TCW_4(__kmp_init_monitor, 0);
6349 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6350 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6351 #endif // KMP_USE_MONITOR
// Branch (b): no active root, so pooled resources can be reaped.
6352 } else {
6353 /* TODO move this to cleanup code */
6354 #ifdef KMP_DEBUG
6355 /* make sure that everything has properly ended */
6356 for (i = 0; i < __kmp_threads_capacity; i++) {
6357 if (__kmp_root[i]) {
6358 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6359 // there can be uber threads alive here
6360 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6363 #endif
6365 KMP_MB();
6367 // Reap the worker threads.
6368 // This is valid for now, but be careful if threads are reaped sooner.
6369 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6370 // Get the next thread from the pool.
6371 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6372 __kmp_thread_pool = thread->th.th_next_pool;
6373 // Reap it.
6374 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6375 thread->th.th_next_pool = NULL;
6376 thread->th.th_in_pool = FALSE;
6377 __kmp_reap_thread(thread, 0);
// The pool is empty now, so the cached insertion point is cleared as well.
6379 __kmp_thread_pool_insert_pt = NULL;
6381 // Reap teams.
6382 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6383 // Get the next team from the pool.
6384 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6385 __kmp_team_pool = team->t.t_next_pool;
6386 // Reap it.
6387 team->t.t_next_pool = NULL;
6388 __kmp_reap_team(team);
6391 __kmp_reap_task_teams();
6393 #if KMP_OS_UNIX
6394 // Threads that are not reaped should not access any resources since they
6395 // are going to be deallocated soon, so the shutdown sequence should wait
6396 // until all threads either exit the final spin-waiting loop or begin
6397 // sleeping after the given blocktime.
6398 for (i = 0; i < __kmp_threads_capacity; i++) {
6399 kmp_info_t *thr = __kmp_threads[i];
// Spin until this thread's th_blocking flag reads as zero.
6400 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6401 KMP_CPU_PAUSE();
6403 #endif
// Placeholder loop: its body holds only the TBD notes below.
6405 for (i = 0; i < __kmp_threads_capacity; ++i) {
6406 // TBD: Add some checking...
6407 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6410 /* Make sure all threadprivate destructors get run by joining with all
6411 worker threads before resetting this flag */
6412 TCW_SYNC_4(__kmp_init_common, FALSE);
6414 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6415 KMP_MB();
6417 #if KMP_USE_MONITOR
6418 // See note above: One of the possible fixes for CQ138434 / CQ140126
6420 // FIXME: push both code fragments down and CSE them?
6421 // push them into __kmp_cleanup() ?
6422 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6423 if (TCR_4(__kmp_init_monitor)) {
6424 __kmp_reap_monitor(&__kmp_monitor);
6425 TCW_4(__kmp_init_monitor, 0);
6427 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6428 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6429 #endif
6430 } /* else !__kmp_global.t_active */
// Common tail, executed after either branch.
6431 TCW_4(__kmp_init_gtid, FALSE);
6432 KMP_MB(); /* Flush all pending memory write invalidates. */
6434 __kmp_cleanup();
6435 #if OMPT_SUPPORT
6436 ompt_fini();
6437 #endif
// Library-level shutdown entry point.
// gtid_req: the caller's global thread id when >= 0; otherwise the id is
//           looked up with __kmp_gtid_get_specific().
// Returns nothing. The function leaves early, without running the full
// shutdown, when g_abort or g_done is already set, when serial initialization
// never happened, when the caller is the monitor thread or a worker thread,
// or when the caller is a root whose r_active flag is still set. Otherwise it
// takes __kmp_initz_lock and then __kmp_forkjoin_lock, runs
// __kmp_internal_end(), releases both locks and finalizes the allocator.
6440 void __kmp_internal_end_library(int gtid_req) {
6441 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6442 /* this shouldn't be a race condition because __kmp_internal_end() is the
6443 only place to clear __kmp_serial_init */
6444 /* we'll check this later too, after we get the lock */
6445 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6446 // redundant, because the next check will work in any case.
6447 if (__kmp_global.g.g_abort) {
6448 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6449 /* TODO abort? */
6450 return;
6452 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6453 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6454 return;
6457 // If hidden helper team has been initialized, we need to deinit it
6458 if (TCR_4(__kmp_init_hidden_helper) &&
6459 !TCR_4(__kmp_hidden_helper_team_done)) {
6460 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6461 // First release the main thread to let it continue its work
6462 __kmp_hidden_helper_main_thread_release();
6463 // Wait until the hidden helper team has been destroyed
6464 __kmp_hidden_helper_threads_deinitz_wait();
6467 KMP_MB(); /* Flush all pending memory write invalidates. */
6468 /* find out who we are and what we should do */
6470 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6471 KA_TRACE(
6472 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
// Dispatch on the kind of caller. Only the KMP_GTID_DNE case and the
// inactive-root case continue to the locked shutdown below.
6473 if (gtid == KMP_GTID_SHUTDOWN) {
6474 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6475 "already shutdown\n"));
6476 return;
6477 } else if (gtid == KMP_GTID_MONITOR) {
6478 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6479 "registered, or system shutdown\n"));
6480 return;
6481 } else if (gtid == KMP_GTID_DNE) {
6482 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6483 "shutdown\n"));
6484 /* we don't know who we are, but we may still shutdown the library */
6485 } else if (KMP_UBER_GTID(gtid)) {
6486 /* unregister ourselves as an uber thread. gtid is no longer valid */
6487 if (__kmp_root[gtid]->r.r_active) {
// The root is still active: flag abort and done, drop the library
// registration, and return without reaping anything.
6488 __kmp_global.g.g_abort = -1;
6489 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6490 __kmp_unregister_library();
6491 KA_TRACE(10,
6492 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6493 gtid));
6494 return;
6495 } else {
6496 __kmp_itthash_clean(__kmp_threads[gtid]);
6497 KA_TRACE(
6499 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6500 __kmp_unregister_root_current_thread(gtid);
6502 } else {
6503 /* worker threads may call this function through the atexit handler, if they
6504 * call exit() */
6505 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6506 TODO: do a thorough shutdown instead */
6507 #ifdef DUMP_DEBUG_ON_EXIT
6508 if (__kmp_debug_buf)
6509 __kmp_dump_debug_buffer();
6510 #endif
6511 // added unregister library call here when we switch to shm linux
6512 // if we don't, it will leave lots of files in /dev/shm
6513 // cleanup shared memory file before exiting.
// Note: g_done is not set on this path, so a later shutdown call can still
// reach __kmp_internal_end(), which calls __kmp_unregister_library() again.
6514 __kmp_unregister_library();
6515 return;
6518 /* synchronize the termination process */
6519 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
// Re-check the flags now that the lock is held; another thread may have
// finished the shutdown in the meantime.
6521 /* have we already finished */
6522 if (__kmp_global.g.g_abort) {
6523 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6524 /* TODO abort? */
6525 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526 return;
6528 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6529 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6530 return;
6533 /* We need this lock to enforce mutex between this reading of
6534 __kmp_threads_capacity and the writing by __kmp_register_root.
6535 Alternatively, we can use a counter of roots that is atomically updated by
6536 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6537 __kmp_internal_end_*. */
6538 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6540 /* now we can safely conduct the actual termination */
6541 __kmp_internal_end();
// Locks are released in the reverse order of acquisition.
6543 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6544 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6546 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6548 #ifdef DUMP_DEBUG_ON_EXIT
6549 if (__kmp_debug_buf)
6550 __kmp_dump_debug_buffer();
6551 #endif
6553 #if KMP_OS_WINDOWS
6554 __kmp_close_console();
6555 #endif
6557 __kmp_fini_allocator();
6559 } // __kmp_internal_end_library
// Thread-level shutdown entry point.
// gtid_req: the caller's global thread id when >= 0; otherwise the id is
//           looked up with __kmp_gtid_get_specific().
// Returns nothing. Compared with __kmp_internal_end_library() as written in
// this file: an unregistered caller (KMP_GTID_DNE) returns immediately, a
// worker thread only clears its th_task_team pointer, the "root still active"
// path does not call __kmp_unregister_library(), a KMP_DYNAMIC_LIB build
// normally defers the shutdown (see the AC note below), and the runtime is
// shut down only when no uber thread remains. __kmp_fini_allocator() is not
// called here.
6561 void __kmp_internal_end_thread(int gtid_req) {
6562 int i;
6564 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6565 /* this shouldn't be a race condition because __kmp_internal_end() is the
6566 * only place to clear __kmp_serial_init */
6567 /* we'll check this later too, after we get the lock */
6568 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6569 // redundant, because the next check will work in any case.
6570 if (__kmp_global.g.g_abort) {
6571 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6572 /* TODO abort? */
6573 return;
6575 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6576 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6577 return;
6580 // If hidden helper team has been initialized, we need to deinit it
6581 if (TCR_4(__kmp_init_hidden_helper) &&
6582 !TCR_4(__kmp_hidden_helper_team_done)) {
6583 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6584 // First release the main thread to let it continue its work
6585 __kmp_hidden_helper_main_thread_release();
6586 // Wait until the hidden helper team has been destroyed
6587 __kmp_hidden_helper_threads_deinitz_wait();
6590 KMP_MB(); /* Flush all pending memory write invalidates. */
6592 /* find out who we are and what we should do */
6594 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6595 KA_TRACE(10,
6596 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
// Dispatch on the kind of caller. Only an inactive root continues past this
// chain; every other case returns.
6597 if (gtid == KMP_GTID_SHUTDOWN) {
6598 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6599 "already shutdown\n"));
6600 return;
6601 } else if (gtid == KMP_GTID_MONITOR) {
6602 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6603 "registered, or system shutdown\n"));
6604 return;
6605 } else if (gtid == KMP_GTID_DNE) {
6606 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6607 "shutdown\n"));
6608 return;
6609 /* we don't know who we are */
6610 } else if (KMP_UBER_GTID(gtid)) {
6611 /* unregister ourselves as an uber thread. gtid is no longer valid */
6612 if (__kmp_root[gtid]->r.r_active) {
// The root is still active: flag abort and done, then return without reaping.
6613 __kmp_global.g.g_abort = -1;
6614 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6615 KA_TRACE(10,
6616 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6617 gtid));
6618 return;
6619 } else {
6620 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6621 gtid));
6622 __kmp_unregister_root_current_thread(gtid);
6624 } else {
6625 /* just a worker thread, let's leave */
6626 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
// Drop the worker's task-team pointer before it leaves; the index is only
// used when gtid is non-negative.
6628 if (gtid >= 0) {
6629 __kmp_threads[gtid]->th.th_task_team = NULL;
6632 KA_TRACE(10,
6633 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6634 gtid));
6635 return;
// Dynamic-library builds: unless the runtime is hard-paused, stop here and
// leave the shutdown to the library destructor (per the AC note below).
6638 #if KMP_DYNAMIC_LIB
6639 if (__kmp_pause_status != kmp_hard_paused)
6640 // AC: lets not shutdown the dynamic library at the exit of uber thread,
6641 // because we will better shutdown later in the library destructor.
6643 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6644 return;
6646 #endif
6647 /* synchronize the termination process */
6648 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
// Re-check the flags now that the lock is held.
6650 /* have we already finished */
6651 if (__kmp_global.g.g_abort) {
6652 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6653 /* TODO abort? */
6654 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655 return;
6657 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6658 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6659 return;
6662 /* We need this lock to enforce mutex between this reading of
6663 __kmp_threads_capacity and the writing by __kmp_register_root.
6664 Alternatively, we can use a counter of roots that is atomically updated by
6665 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6666 __kmp_internal_end_*. */
6668 /* should we finish the run-time? are all siblings done? */
6669 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
// If any slot still holds an uber thread, release both locks and return
// without shutting the runtime down.
6671 for (i = 0; i < __kmp_threads_capacity; ++i) {
6672 if (KMP_UBER_GTID(i)) {
6673 KA_TRACE(
6675 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6676 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6677 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678 return;
6682 /* now we can safely conduct the actual termination */
6684 __kmp_internal_end();
// Locks are released in the reverse order of acquisition.
6686 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6687 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6689 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6691 #ifdef DUMP_DEBUG_ON_EXIT
6692 if (__kmp_debug_buf)
6693 __kmp_dump_debug_buffer();
6694 #endif
6695 } // __kmp_internal_end_thread
6697 // -----------------------------------------------------------------------------
6698 // Library registration stuff.
// Registration state written by __kmp_register_library_startup() and cleared
// by __kmp_unregister_library(). Each original comment below FOLLOWS the
// declaration it describes.
6700 static long __kmp_registration_flag = 0;
6701 // Random value used to indicate library initialization.
// (Built at registration time as 0xCAFE0000 OR-ed with the low 16 bits of the
// system time.)
6702 static char *__kmp_registration_str = NULL;
6703 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
// (Formatted as "<address of the flag>-<flag value in hex>-<KMP_LIBRARY_FILE>".
// The same string is written to the /dev/shm or /tmp registration file when
// one of those is used instead of the environment variable.)
6705 static inline char *__kmp_reg_status_name() {
6706 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6707 each thread. If registration and unregistration go in different threads
6708 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6709 env var can not be found, because the name will contain different pid. */
6710 // macOS* complains about name being too long with additional getuid()
6711 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6712 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6713 (int)getuid());
6714 #else
6715 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6716 #endif
6717 } // __kmp_reg_status_get
6719 #if defined(KMP_USE_SHM)
// Which medium holds the registration record. Both flags are refreshed on each
// pass of the loop in __kmp_register_library_startup() and are read again by
// __kmp_unregister_library() to locate the record.
// Set from __kmp_detect_shm(); cleared again if opening, sizing or mapping
// the shared-memory object fails.
6720 bool __kmp_shm_available = false;
// Set from __kmp_detect_tmp() when shared memory is not usable; cleared again
// if opening, sizing or mapping the /tmp file fails.
6721 bool __kmp_tmp_available = false;
6722 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
// Holds the "/tmp/<registration name>" path built during registration; it is
// released in __kmp_unregister_library().
6723 char *temp_reg_status_file_name = nullptr;
6724 #endif
6726 void __kmp_register_library_startup(void) {
6728 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6729 int done = 0;
6730 union {
6731 double dtime;
6732 long ltime;
6733 } time;
6734 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6735 __kmp_initialize_system_tick();
6736 #endif
6737 __kmp_read_system_time(&time.dtime);
6738 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6739 __kmp_registration_str =
6740 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6741 __kmp_registration_flag, KMP_LIBRARY_FILE);
6743 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6744 __kmp_registration_str));
6746 while (!done) {
6748 char *value = NULL; // Actual value of the environment variable.
6750 #if defined(KMP_USE_SHM)
6751 char *shm_name = nullptr;
6752 char *data1 = nullptr;
6753 __kmp_shm_available = __kmp_detect_shm();
6754 if (__kmp_shm_available) {
6755 int fd1 = -1;
6756 shm_name = __kmp_str_format("/%s", name);
6757 int shm_preexist = 0;
6758 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6759 if ((fd1 == -1) && (errno == EEXIST)) {
6760 // file didn't open because it already exists.
6761 // try opening existing file
6762 fd1 = shm_open(shm_name, O_RDWR, 0666);
6763 if (fd1 == -1) { // file didn't open
6764 KMP_WARNING(FunctionError, "Can't open SHM");
6765 __kmp_shm_available = false;
6766 } else { // able to open existing file
6767 shm_preexist = 1;
6770 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6771 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6772 KMP_WARNING(FunctionError, "Can't set size of SHM");
6773 __kmp_shm_available = false;
6776 if (__kmp_shm_available) { // SHM exists, now map it
6777 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6778 fd1, 0);
6779 if (data1 == MAP_FAILED) { // failed to map shared memory
6780 KMP_WARNING(FunctionError, "Can't map SHM");
6781 __kmp_shm_available = false;
6784 if (__kmp_shm_available) { // SHM mapped
6785 if (shm_preexist == 0) { // set data to SHM, set value
6786 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6788 // Read value from either what we just wrote or existing file.
6789 value = __kmp_str_format("%s", data1); // read value from SHM
6790 munmap(data1, SHM_SIZE);
6792 if (fd1 != -1)
6793 close(fd1);
6795 if (!__kmp_shm_available)
6796 __kmp_tmp_available = __kmp_detect_tmp();
6797 if (!__kmp_shm_available && __kmp_tmp_available) {
6798 // SHM failed to work due to an error other than that the file already
6799 // exists. Try to create a temp file under /tmp.
6800 // If /tmp isn't accessible, fall back to using environment variable.
6801 // TODO: /tmp might not always be the temporary directory. For now we will
6802 // not consider TMPDIR.
6803 int fd1 = -1;
6804 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6805 int tmp_preexist = 0;
6806 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6807 if ((fd1 == -1) && (errno == EEXIST)) {
6808 // file didn't open because it already exists.
6809 // try opening existing file
6810 fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6811 if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6812 KMP_WARNING(FunctionError, "Can't open TEMP");
6813 __kmp_tmp_available = false;
6814 } else {
6815 tmp_preexist = 1;
6818 if (__kmp_tmp_available && tmp_preexist == 0) {
6819 // we created /tmp file now set size
6820 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6821 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6822 __kmp_tmp_available = false;
6825 if (__kmp_tmp_available) {
6826 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6827 fd1, 0);
6828 if (data1 == MAP_FAILED) { // failed to map /tmp
6829 KMP_WARNING(FunctionError, "Can't map /tmp");
6830 __kmp_tmp_available = false;
6833 if (__kmp_tmp_available) {
6834 if (tmp_preexist == 0) { // set data to TMP, set value
6835 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6837 // Read value from either what we just wrote or existing file.
6838 value = __kmp_str_format("%s", data1); // read value from SHM
6839 munmap(data1, SHM_SIZE);
6841 if (fd1 != -1)
6842 close(fd1);
6844 if (!__kmp_shm_available && !__kmp_tmp_available) {
6845 // no /dev/shm and no /tmp -- fall back to environment variable
6846 // Set environment variable, but do not overwrite if it exists.
6847 __kmp_env_set(name, __kmp_registration_str, 0);
6848 // read value to see if it got set
6849 value = __kmp_env_get(name);
6851 #else // Windows and unix with static library
6852 // Set environment variable, but do not overwrite if it exists.
6853 __kmp_env_set(name, __kmp_registration_str, 0);
6854 // read value to see if it got set
6855 value = __kmp_env_get(name);
6856 #endif
6858 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6859 done = 1; // Ok, environment variable set successfully, exit the loop.
6860 } else {
6861 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6862 // Check whether it alive or dead.
6863 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6864 char *tail = value;
6865 char *flag_addr_str = NULL;
6866 char *flag_val_str = NULL;
6867 char const *file_name = NULL;
6868 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6869 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6870 file_name = tail;
6871 if (tail != NULL) {
6872 unsigned long *flag_addr = 0;
6873 unsigned long flag_val = 0;
6874 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6875 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6876 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6877 // First, check whether environment-encoded address is mapped into
6878 // addr space.
6879 // If so, dereference it to see if it still has the right value.
6880 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6881 neighbor = 1;
6882 } else {
6883 // If not, then we know the other copy of the library is no longer
6884 // running.
6885 neighbor = 2;
6889 switch (neighbor) {
6890 case 0: // Cannot parse environment variable -- neighbor status unknown.
6891 // Assume it is the incompatible format of future version of the
6892 // library. Assume the other library is alive.
6893 // WARN( ... ); // TODO: Issue a warning.
6894 file_name = "unknown library";
6895 KMP_FALLTHROUGH();
6896 // Attention! Falling to the next case. That's intentional.
6897 case 1: { // Neighbor is alive.
6898 // Check it is allowed.
6899 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6900 if (!__kmp_str_match_true(duplicate_ok)) {
6901 // That's not allowed. Issue fatal error.
6902 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6903 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6905 KMP_INTERNAL_FREE(duplicate_ok);
6906 __kmp_duplicate_library_ok = 1;
6907 done = 1; // Exit the loop.
6908 } break;
6909 case 2: { // Neighbor is dead.
6911 #if defined(KMP_USE_SHM)
6912 if (__kmp_shm_available) { // close shared memory.
6913 shm_unlink(shm_name); // this removes file in /dev/shm
6914 } else if (__kmp_tmp_available) {
6915 unlink(temp_reg_status_file_name); // this removes the temp file
6916 } else {
6917 // Clear the variable and try to register library again.
6918 __kmp_env_unset(name);
6920 #else
6921 // Clear the variable and try to register library again.
6922 __kmp_env_unset(name);
6923 #endif
6924 } break;
6925 default: {
6926 KMP_DEBUG_ASSERT(0);
6927 } break;
6930 KMP_INTERNAL_FREE((void *)value);
6931 #if defined(KMP_USE_SHM)
6932 if (shm_name)
6933 KMP_INTERNAL_FREE((void *)shm_name);
6934 #endif
6935 } // while
6936 KMP_INTERNAL_FREE((void *)name);
6938 } // func __kmp_register_library_startup
6940 void __kmp_unregister_library(void) {
6942 char *name = __kmp_reg_status_name();
6943 char *value = NULL;
6945 #if defined(KMP_USE_SHM)
6946 char *shm_name = nullptr;
6947 int fd1;
6948 if (__kmp_shm_available) {
6949 shm_name = __kmp_str_format("/%s", name);
6950 fd1 = shm_open(shm_name, O_RDONLY, 0666);
6951 if (fd1 != -1) { // File opened successfully
6952 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6953 if (data1 != MAP_FAILED) {
6954 value = __kmp_str_format("%s", data1); // read value from SHM
6955 munmap(data1, SHM_SIZE);
6957 close(fd1);
6959 } else if (__kmp_tmp_available) { // try /tmp
6960 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6961 if (fd1 != -1) { // File opened successfully
6962 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6963 if (data1 != MAP_FAILED) {
6964 value = __kmp_str_format("%s", data1); // read value from /tmp
6965 munmap(data1, SHM_SIZE);
6967 close(fd1);
6969 } else { // fall back to envirable
6970 value = __kmp_env_get(name);
6972 #else
6973 value = __kmp_env_get(name);
6974 #endif
6976 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6977 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6978 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6979 // Ok, this is our variable. Delete it.
6980 #if defined(KMP_USE_SHM)
6981 if (__kmp_shm_available) {
6982 shm_unlink(shm_name); // this removes file in /dev/shm
6983 } else if (__kmp_tmp_available) {
6984 unlink(temp_reg_status_file_name); // this removes the temp file
6985 } else {
6986 __kmp_env_unset(name);
6988 #else
6989 __kmp_env_unset(name);
6990 #endif
6993 #if defined(KMP_USE_SHM)
6994 if (shm_name)
6995 KMP_INTERNAL_FREE(shm_name);
6996 if (temp_reg_status_file_name)
6997 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6998 #endif
7000 KMP_INTERNAL_FREE(__kmp_registration_str);
7001 KMP_INTERNAL_FREE(value);
7002 KMP_INTERNAL_FREE(name);
7004 __kmp_registration_flag = 0;
7005 __kmp_registration_str = NULL;
7007 } // __kmp_unregister_library
7009 // End of Library registration stuff.
7010 // -----------------------------------------------------------------------------
#if KMP_MIC_SUPPORTED

// Set __kmp_mic_type from the EAX value returned by CPUID leaf 1:
// mic2 when (eax & 0xff0) == 0xB10, mic3 when (eax & 0xf0ff0) == 0x50670,
// and non_mic for anything else.
static void __kmp_check_mic_type() {
  kmp_cpuid_t regs = {0};
  __kmp_x86_cpuid(1, 0, &regs);
  // We don't support mic1 at the moment
  const bool matches_mic2 = (regs.eax & 0xff0) == 0xB10;
  const bool matches_mic3 = (regs.eax & 0xf0ff0) == 0x50670;
  // The mic2 pattern is tested first, exactly as in an if / else-if chain.
  __kmp_mic_type = matches_mic2 ? mic2 : (matches_mic3 ? mic3 : non_mic);
}

#endif /* KMP_MIC_SUPPORTED */
#if KMP_HAVE_UMWAIT
// Decide whether umwait/tpause may be used. Bit 5 of ECX from CPUID leaf 7
// (subleaf 0) is stored in __kmp_waitpkg_enabled; umwait additionally needs
// __kmp_user_level_mwait, and tpause needs __kmp_tpause_state > 0.
static void __kmp_user_level_mwait_init() {
  struct kmp_cpuid leaf7;
  __kmp_x86_cpuid(7, 0, &leaf7);
  __kmp_waitpkg_enabled = ((leaf7.ecx >> 5) & 1);
  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                __kmp_umwait_enabled));
}
#elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when we know that.
#define AT_INTELPHIUSERMWAIT 10000
#endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }

// Decide whether user-level mwait may be used. Only a mic3 target changes
// __kmp_mwait_enabled: it is enabled when bit 0 of
// getauxval(AT_INTELPHIUSERMWAIT) is set or when __kmp_user_level_mwait is
// set, and disabled otherwise.
static void __kmp_user_level_mwait_init() {
  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
  // KMP_USER_LEVEL_MWAIT was set to TRUE.
  if (__kmp_mic_type == mic3) {
    const unsigned long aux_value = getauxval(AT_INTELPHIUSERMWAIT);
    const bool allow_mwait = (aux_value & 0x1) || __kmp_user_level_mwait;
    __kmp_mwait_enabled = allow_mwait ? TRUE : FALSE;
    // The notice is issued only when the user setting is what enabled mwait.
    if (__kmp_user_level_mwait) {
      KMP_INFORM(EnvMwaitWarn);
    }
  }
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
                "__kmp_mwait_enabled = %d\n",
                __kmp_mic_type, __kmp_mwait_enabled));
}
#endif /* KMP_HAVE_UMWAIT */
7074 static void __kmp_do_serial_initialize(void) {
7075 int i, gtid;
7076 size_t size;
7078 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7080 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7081 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7082 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7083 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7084 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7086 #if OMPT_SUPPORT
7087 ompt_pre_init();
7088 #endif
7089 #if OMPD_SUPPORT
7090 __kmp_env_dump();
7091 ompd_init();
7092 #endif
7094 __kmp_validate_locks();
7096 #if ENABLE_LIBOMPTARGET
7097 /* Initialize functions from libomptarget */
7098 __kmp_init_omptarget();
7099 #endif
7101 /* Initialize internal memory allocator */
7102 __kmp_init_allocator();
7104 /* Register the library startup via an environment variable or via mapped
7105 shared memory file and check to see whether another copy of the library is
7106 already registered. Since forked child process is often terminated, we
7107 postpone the registration till middle initialization in the child */
7108 if (__kmp_need_register_serial)
7109 __kmp_register_library_startup();
7111 /* TODO reinitialization of library */
7112 if (TCR_4(__kmp_global.g.g_done)) {
7113 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7116 __kmp_global.g.g_abort = 0;
7117 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7119 /* initialize the locks */
7120 #if KMP_USE_ADAPTIVE_LOCKS
7121 #if KMP_DEBUG_ADAPTIVE_LOCKS
7122 __kmp_init_speculative_stats();
7123 #endif
7124 #endif
7125 #if KMP_STATS_ENABLED
7126 __kmp_stats_init();
7127 #endif
7128 __kmp_init_lock(&__kmp_global_lock);
7129 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7130 __kmp_init_lock(&__kmp_debug_lock);
7131 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7132 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7133 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7134 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7135 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7136 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7137 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7138 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7139 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7140 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7141 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7142 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7143 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7144 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7145 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7146 #if KMP_USE_MONITOR
7147 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7148 #endif
7149 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7151 /* conduct initialization and initial setup of configuration */
7153 __kmp_runtime_initialize();
7155 #if KMP_MIC_SUPPORTED
7156 __kmp_check_mic_type();
7157 #endif
7159 // Some global variable initialization moved here from kmp_env_initialize()
7160 #ifdef KMP_DEBUG
7161 kmp_diag = 0;
7162 #endif
7163 __kmp_abort_delay = 0;
7165 // From __kmp_init_dflt_team_nth()
7166 /* assume the entire machine will be used */
7167 __kmp_dflt_team_nth_ub = __kmp_xproc;
7168 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7169 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7171 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7172 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7174 __kmp_max_nth = __kmp_sys_max_nth;
7175 __kmp_cg_max_nth = __kmp_sys_max_nth;
7176 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7177 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7178 __kmp_teams_max_nth = __kmp_sys_max_nth;
7181 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7182 // part
7183 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7184 #if KMP_USE_MONITOR
7185 __kmp_monitor_wakeups =
7186 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7187 __kmp_bt_intervals =
7188 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189 #endif
7190 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7191 __kmp_library = library_throughput;
7192 // From KMP_SCHEDULE initialization
7193 __kmp_static = kmp_sch_static_balanced;
7194 // AC: do not use analytical here, because it is non-monotonous
7195 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7196 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7197 // need to repeat assignment
7198 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7199 // bit control and barrier method control parts
7200 #if KMP_FAST_REDUCTION_BARRIER
7201 #define kmp_reduction_barrier_gather_bb ((int)1)
7202 #define kmp_reduction_barrier_release_bb ((int)1)
7203 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7204 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7205 #endif // KMP_FAST_REDUCTION_BARRIER
7206 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7207 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7208 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7209 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7210 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7211 #if KMP_FAST_REDUCTION_BARRIER
7212 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7213 // lin_64 ): hyper,1
7214 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7215 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7216 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7217 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7219 #endif // KMP_FAST_REDUCTION_BARRIER
7221 #if KMP_FAST_REDUCTION_BARRIER
7222 #undef kmp_reduction_barrier_release_pat
7223 #undef kmp_reduction_barrier_gather_pat
7224 #undef kmp_reduction_barrier_release_bb
7225 #undef kmp_reduction_barrier_gather_bb
7226 #endif // KMP_FAST_REDUCTION_BARRIER
7227 #if KMP_MIC_SUPPORTED
7228 if (__kmp_mic_type == mic2) { // KNC
7229 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7230 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7231 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7232 1; // forkjoin release
7233 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7234 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7236 #if KMP_FAST_REDUCTION_BARRIER
7237 if (__kmp_mic_type == mic2) { // KNC
7238 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7239 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7241 #endif // KMP_FAST_REDUCTION_BARRIER
7242 #endif // KMP_MIC_SUPPORTED
7244 // From KMP_CHECKS initialization
7245 #ifdef KMP_DEBUG
7246 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7247 #else
7248 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7249 #endif
7251 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7252 __kmp_foreign_tp = TRUE;
7254 __kmp_global.g.g_dynamic = FALSE;
7255 __kmp_global.g.g_dynamic_mode = dynamic_default;
7257 __kmp_init_nesting_mode();
7259 __kmp_env_initialize(NULL);
7261 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7262 __kmp_user_level_mwait_init();
7263 #endif
7264 // Print all messages in message catalog for testing purposes.
7265 #ifdef KMP_DEBUG
7266 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7267 if (__kmp_str_match_true(val)) {
7268 kmp_str_buf_t buffer;
7269 __kmp_str_buf_init(&buffer);
7270 __kmp_i18n_dump_catalog(&buffer);
7271 __kmp_printf("%s", buffer.str);
7272 __kmp_str_buf_free(&buffer);
7274 __kmp_env_free(&val);
7275 #endif
7277 __kmp_threads_capacity =
7278 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7279 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7280 __kmp_tp_capacity = __kmp_default_tp_capacity(
7281 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7283 // If the library is shut down properly, both pools must be NULL. Just in
7284 // case, set them to NULL -- some memory may leak, but subsequent code will
7285 // work even if pools are not freed.
7286 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7287 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7288 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7289 __kmp_thread_pool = NULL;
7290 __kmp_thread_pool_insert_pt = NULL;
7291 __kmp_team_pool = NULL;
7293 /* Allocate all of the variable sized records */
7294 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7295 * expandable */
7296 /* Since allocation is cache-aligned, just add extra padding at the end */
7297 size =
7298 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7299 CACHE_LINE;
7300 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7301 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7302 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7304 /* init thread counts */
7305 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7306 0); // Asserts fail if the library is reinitializing and
7307 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7308 __kmp_all_nth = 0;
7309 __kmp_nth = 0;
7311 /* setup the uber master thread and hierarchy */
7312 gtid = __kmp_register_root(TRUE);
7313 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7314 KMP_ASSERT(KMP_UBER_GTID(gtid));
7315 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7317 KMP_MB(); /* Flush all pending memory write invalidates. */
7319 __kmp_common_initialize();
7321 #if KMP_OS_UNIX
7322 /* invoke the child fork handler */
7323 __kmp_register_atfork();
7324 #endif
7326 #if !KMP_DYNAMIC_LIB || \
7327 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7329 /* Invoke the exit handler when the program finishes, only for static
7330 library and macOS* dynamic. For other dynamic libraries, we already
7331 have _fini and DllMain. */
7332 int rc = atexit(__kmp_internal_end_atexit);
7333 if (rc != 0) {
7334 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7335 __kmp_msg_null);
7338 #endif
7340 #if KMP_HANDLE_SIGNALS
7341 #if KMP_OS_UNIX
7342 /* NOTE: make sure that this is called before the user installs their own
7343 signal handlers so that the user handlers are called first. this way they
7344 can return false, not call our handler, avoid terminating the library, and
7345 continue execution where they left off. */
7346 __kmp_install_signals(FALSE);
7347 #endif /* KMP_OS_UNIX */
7348 #if KMP_OS_WINDOWS
7349 __kmp_install_signals(TRUE);
7350 #endif /* KMP_OS_WINDOWS */
7351 #endif
7353 /* we have finished the serial initialization */
7354 __kmp_init_counter++;
7356 __kmp_init_serial = TRUE;
7358 if (__kmp_version) {
7359 __kmp_print_version_1();
7362 if (__kmp_settings) {
7363 __kmp_env_print();
7366 if (__kmp_display_env || __kmp_display_env_verbose) {
7367 __kmp_env_print_2();
7370 #if OMPT_SUPPORT
7371 ompt_post_init();
7372 #endif
7374 KMP_MB();
7376 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7379 void __kmp_serial_initialize(void) {
7380 if (__kmp_init_serial) {
7381 return;
7383 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7384 if (__kmp_init_serial) {
7385 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7386 return;
7388 __kmp_do_serial_initialize();
7389 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7392 static void __kmp_do_middle_initialize(void) {
7393 int i, j;
7394 int prev_dflt_team_nth;
7396 if (!__kmp_init_serial) {
7397 __kmp_do_serial_initialize();
7400 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7402 if (UNLIKELY(!__kmp_need_register_serial)) {
7403 // We are in a forked child process. The registration was skipped during
7404 // serial initialization in __kmp_atfork_child handler. Do it here.
7405 __kmp_register_library_startup();
7408 // Save the previous value for the __kmp_dflt_team_nth so that
7409 // we can avoid some reinitialization if it hasn't changed.
7410 prev_dflt_team_nth = __kmp_dflt_team_nth;
7412 #if KMP_AFFINITY_SUPPORTED
7413 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7414 // number of cores on the machine.
7415 __kmp_affinity_initialize(__kmp_affinity);
7417 #endif /* KMP_AFFINITY_SUPPORTED */
7419 KMP_ASSERT(__kmp_xproc > 0);
7420 if (__kmp_avail_proc == 0) {
7421 __kmp_avail_proc = __kmp_xproc;
7424 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7425 // correct them now
7426 j = 0;
7427 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7428 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7429 __kmp_avail_proc;
7430 j++;
7433 if (__kmp_dflt_team_nth == 0) {
7434 #ifdef KMP_DFLT_NTH_CORES
7435 // Default #threads = #cores
7436 __kmp_dflt_team_nth = __kmp_ncores;
7437 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7438 "__kmp_ncores (%d)\n",
7439 __kmp_dflt_team_nth));
7440 #else
7441 // Default #threads = #available OS procs
7442 __kmp_dflt_team_nth = __kmp_avail_proc;
7443 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7444 "__kmp_avail_proc(%d)\n",
7445 __kmp_dflt_team_nth));
7446 #endif /* KMP_DFLT_NTH_CORES */
7449 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7450 __kmp_dflt_team_nth = KMP_MIN_NTH;
7452 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7453 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7456 if (__kmp_nesting_mode > 0)
7457 __kmp_set_nesting_mode_threads();
7459 // There's no harm in continuing if the following check fails,
7460 // but it indicates an error in the previous logic.
7461 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7463 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7464 // Run through the __kmp_threads array and set the num threads icv for each
7465 // root thread that is currently registered with the RTL (which has not
7466 // already explicitly set its nthreads-var with a call to
7467 // omp_set_num_threads()).
7468 for (i = 0; i < __kmp_threads_capacity; i++) {
7469 kmp_info_t *thread = __kmp_threads[i];
7470 if (thread == NULL)
7471 continue;
7472 if (thread->th.th_current_task->td_icvs.nproc != 0)
7473 continue;
7475 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7478 KA_TRACE(
7480 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7481 __kmp_dflt_team_nth));
7483 #ifdef KMP_ADJUST_BLOCKTIME
7484 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7485 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7486 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7487 if (__kmp_nth > __kmp_avail_proc) {
7488 __kmp_zero_bt = TRUE;
7491 #endif /* KMP_ADJUST_BLOCKTIME */
7493 /* we have finished middle initialization */
7494 TCW_SYNC_4(__kmp_init_middle, TRUE);
7496 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7499 void __kmp_middle_initialize(void) {
7500 if (__kmp_init_middle) {
7501 return;
7503 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7504 if (__kmp_init_middle) {
7505 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7506 return;
7508 __kmp_do_middle_initialize();
7509 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7512 void __kmp_parallel_initialize(void) {
7513 int gtid = __kmp_entry_gtid(); // this might be a new root
7515 /* synchronize parallel initialization (for sibling) */
7516 if (TCR_4(__kmp_init_parallel))
7517 return;
7518 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7519 if (TCR_4(__kmp_init_parallel)) {
7520 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7521 return;
7524 /* TODO reinitialization after we have already shut down */
7525 if (TCR_4(__kmp_global.g.g_done)) {
7526 KA_TRACE(
7528 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7529 __kmp_infinite_loop();
7532 /* jc: The lock __kmp_initz_lock is already held, so calling
7533 __kmp_serial_initialize would cause a deadlock. So we call
7534 __kmp_do_serial_initialize directly. */
7535 if (!__kmp_init_middle) {
7536 __kmp_do_middle_initialize();
7538 __kmp_assign_root_init_mask();
7539 __kmp_resume_if_hard_paused();
7541 /* begin initialization */
7542 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7543 KMP_ASSERT(KMP_UBER_GTID(gtid));
7545 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7546 // Save the FP control regs.
7547 // Worker threads will set theirs to these values at thread startup.
7548 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7549 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7550 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7551 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7553 #if KMP_OS_UNIX
7554 #if KMP_HANDLE_SIGNALS
7555 /* must be after __kmp_serial_initialize */
7556 __kmp_install_signals(TRUE);
7557 #endif
7558 #endif
7560 __kmp_suspend_initialize();
7562 #if defined(USE_LOAD_BALANCE)
7563 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7564 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7566 #else
7567 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7568 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7570 #endif
7572 if (__kmp_version) {
7573 __kmp_print_version_2();
7576 /* we have finished parallel initialization */
7577 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7579 KMP_MB();
7580 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7582 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7585 void __kmp_hidden_helper_initialize() {
7586 if (TCR_4(__kmp_init_hidden_helper))
7587 return;
7589 // __kmp_parallel_initialize is required before we initialize hidden helper
7590 if (!TCR_4(__kmp_init_parallel))
7591 __kmp_parallel_initialize();
7593 // Double check. Note that this double check should not be placed before
7594 // __kmp_parallel_initialize as it will cause dead lock.
7595 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7596 if (TCR_4(__kmp_init_hidden_helper)) {
7597 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7598 return;
7601 #if KMP_AFFINITY_SUPPORTED
7602 // Initialize hidden helper affinity settings.
7603 // The above __kmp_parallel_initialize() will initialize
7604 // regular affinity (and topology) if not already done.
7605 if (!__kmp_hh_affinity.flags.initialized)
7606 __kmp_affinity_initialize(__kmp_hh_affinity);
7607 #endif
7609 // Set the count of hidden helper tasks to be executed to zero
7610 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7612 // Set the global variable indicating that we're initializing hidden helper
7613 // team/threads
7614 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7616 // Platform independent initialization
7617 __kmp_do_initialize_hidden_helper_threads();
7619 // Wait here for the finish of initialization of hidden helper teams
7620 __kmp_hidden_helper_threads_initz_wait();
7622 // We have finished hidden helper initialization
7623 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7625 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7628 /* ------------------------------------------------------------------------ */
7630 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7631 kmp_team_t *team) {
7632 kmp_disp_t *dispatch;
7634 KMP_MB();
7636 /* none of the threads have encountered any constructs, yet. */
7637 this_thr->th.th_local.this_construct = 0;
7638 #if KMP_CACHE_MANAGE
7639 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7640 #endif /* KMP_CACHE_MANAGE */
7641 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7642 KMP_DEBUG_ASSERT(dispatch);
7643 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7644 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7645 // this_thr->th.th_info.ds.ds_tid ] );
7647 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7648 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7649 if (__kmp_env_consistency_check)
7650 __kmp_push_parallel(gtid, team->t.t_ident);
7652 KMP_MB(); /* Flush all pending memory write invalidates. */
7655 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7656 kmp_team_t *team) {
7657 if (__kmp_env_consistency_check)
7658 __kmp_pop_parallel(gtid, team->t.t_ident);
7660 __kmp_finish_implicit_task(this_thr);
7663 int __kmp_invoke_task_func(int gtid) {
7664 int rc;
7665 int tid = __kmp_tid_from_gtid(gtid);
7666 kmp_info_t *this_thr = __kmp_threads[gtid];
7667 kmp_team_t *team = this_thr->th.th_team;
7669 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7670 #if USE_ITT_BUILD
7671 if (__itt_stack_caller_create_ptr) {
7672 // inform ittnotify about entering user's code
7673 if (team->t.t_stack_id != NULL) {
7674 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7675 } else {
7676 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7677 __kmp_itt_stack_callee_enter(
7678 (__itt_caller)team->t.t_parent->t.t_stack_id);
7681 #endif /* USE_ITT_BUILD */
7682 #if INCLUDE_SSC_MARKS
7683 SSC_MARK_INVOKING();
7684 #endif
7686 #if OMPT_SUPPORT
7687 void *dummy;
7688 void **exit_frame_p;
7689 ompt_data_t *my_task_data;
7690 ompt_data_t *my_parallel_data;
7691 int ompt_team_size;
7693 if (ompt_enabled.enabled) {
7694 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7695 .ompt_task_info.frame.exit_frame.ptr);
7696 } else {
7697 exit_frame_p = &dummy;
7700 my_task_data =
7701 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7702 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7703 if (ompt_enabled.ompt_callback_implicit_task) {
7704 ompt_team_size = team->t.t_nproc;
7705 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7706 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7707 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7708 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7710 #endif
7712 #if KMP_STATS_ENABLED
7713 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7714 if (previous_state == stats_state_e::TEAMS_REGION) {
7715 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7716 } else {
7717 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7719 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7720 #endif
7722 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7723 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7724 #if OMPT_SUPPORT
7726 exit_frame_p
7727 #endif
7729 #if OMPT_SUPPORT
7730 *exit_frame_p = NULL;
7731 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7732 #endif
7734 #if KMP_STATS_ENABLED
7735 if (previous_state == stats_state_e::TEAMS_REGION) {
7736 KMP_SET_THREAD_STATE(previous_state);
7738 KMP_POP_PARTITIONED_TIMER();
7739 #endif
7741 #if USE_ITT_BUILD
7742 if (__itt_stack_caller_create_ptr) {
7743 // inform ittnotify about leaving user's code
7744 if (team->t.t_stack_id != NULL) {
7745 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7746 } else {
7747 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7748 __kmp_itt_stack_callee_leave(
7749 (__itt_caller)team->t.t_parent->t.t_stack_id);
7752 #endif /* USE_ITT_BUILD */
7753 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7755 return rc;
7758 void __kmp_teams_master(int gtid) {
7759 // This routine is called by all primary threads in teams construct
7760 kmp_info_t *thr = __kmp_threads[gtid];
7761 kmp_team_t *team = thr->th.th_team;
7762 ident_t *loc = team->t.t_ident;
7763 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7764 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7765 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7766 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7767 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7769 // This thread is a new CG root. Set up the proper variables.
7770 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7771 tmp->cg_root = thr; // Make thr the CG root
7772 // Init to thread limit stored when league primary threads were forked
7773 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7774 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7775 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7776 " cg_nthreads to 1\n",
7777 thr, tmp));
7778 tmp->up = thr->th.th_cg_roots;
7779 thr->th.th_cg_roots = tmp;
7781 // Launch league of teams now, but not let workers execute
7782 // (they hang on fork barrier until next parallel)
7783 #if INCLUDE_SSC_MARKS
7784 SSC_MARK_FORKING();
7785 #endif
7786 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7787 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7788 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7789 #if INCLUDE_SSC_MARKS
7790 SSC_MARK_JOINING();
7791 #endif
7792 // If the team size was reduced from the limit, set it to the new size
7793 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7794 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7795 // AC: last parameter "1" eliminates join barrier which won't work because
7796 // worker threads are in a fork barrier waiting for more parallel regions
7797 __kmp_join_call(loc, gtid
7798 #if OMPT_SUPPORT
7800 fork_context_intel
7801 #endif
7806 int __kmp_invoke_teams_master(int gtid) {
7807 kmp_info_t *this_thr = __kmp_threads[gtid];
7808 kmp_team_t *team = this_thr->th.th_team;
7809 #if KMP_DEBUG
7810 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7811 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7812 (void *)__kmp_teams_master);
7813 #endif
7814 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7815 #if OMPT_SUPPORT
7816 int tid = __kmp_tid_from_gtid(gtid);
7817 ompt_data_t *task_data =
7818 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7819 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7820 if (ompt_enabled.ompt_callback_implicit_task) {
7821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7822 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7823 ompt_task_initial);
7824 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7826 #endif
7827 __kmp_teams_master(gtid);
7828 #if OMPT_SUPPORT
7829 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7830 #endif
7831 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7832 return 1;
7835 /* this sets the requested number of threads for the next parallel region
7836 encountered by this team. since this should be enclosed in the forkjoin
7837 critical section it should avoid race conditions with asymmetrical nested
7838 parallelism */
7840 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7841 kmp_info_t *thr = __kmp_threads[gtid];
7843 if (num_threads > 0)
7844 thr->th.th_set_nproc = num_threads;
7847 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7848 int num_threads) {
7849 KMP_DEBUG_ASSERT(thr);
7850 // Remember the number of threads for inner parallel regions
7851 if (!TCR_4(__kmp_init_middle))
7852 __kmp_middle_initialize(); // get internal globals calculated
7853 __kmp_assign_root_init_mask();
7854 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7855 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7857 if (num_threads == 0) {
7858 if (__kmp_teams_thread_limit > 0) {
7859 num_threads = __kmp_teams_thread_limit;
7860 } else {
7861 num_threads = __kmp_avail_proc / num_teams;
7863 // adjust num_threads w/o warning as it is not user setting
7864 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7865 // no thread_limit clause specified - do not change thread-limit-var ICV
7866 if (num_threads > __kmp_dflt_team_nth) {
7867 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7869 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7870 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7871 } // prevent team size to exceed thread-limit-var
7872 if (num_teams * num_threads > __kmp_teams_max_nth) {
7873 num_threads = __kmp_teams_max_nth / num_teams;
7875 if (num_threads == 0) {
7876 num_threads = 1;
7878 } else {
7879 if (num_threads < 0) {
7880 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7881 __kmp_msg_null);
7882 num_threads = 1;
7884 // This thread will be the primary thread of the league primary threads
7885 // Store new thread limit; old limit is saved in th_cg_roots list
7886 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7887 // num_threads = min(num_threads, nthreads-var)
7888 if (num_threads > __kmp_dflt_team_nth) {
7889 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7891 if (num_teams * num_threads > __kmp_teams_max_nth) {
7892 int new_threads = __kmp_teams_max_nth / num_teams;
7893 if (new_threads == 0) {
7894 new_threads = 1;
7896 if (new_threads != num_threads) {
7897 if (!__kmp_reserve_warn) { // user asked for too many threads
7898 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7899 __kmp_msg(kmp_ms_warning,
7900 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7901 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904 num_threads = new_threads;
7907 thr->th.th_teams_size.nth = num_threads;
7910 /* this sets the requested number of teams for the teams region and/or
7911 the number of threads for the next parallel region encountered */
7912 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7913 int num_threads) {
7914 kmp_info_t *thr = __kmp_threads[gtid];
7915 if (num_teams < 0) {
7916 // OpenMP specification requires requested values to be positive,
7917 // but people can send us any value, so we'd better check
7918 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7919 __kmp_msg_null);
7920 num_teams = 1;
7922 if (num_teams == 0) {
7923 if (__kmp_nteams > 0) {
7924 num_teams = __kmp_nteams;
7925 } else {
7926 num_teams = 1; // default number of teams is 1.
7929 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7930 if (!__kmp_reserve_warn) {
7931 __kmp_reserve_warn = 1;
7932 __kmp_msg(kmp_ms_warning,
7933 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7934 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7936 num_teams = __kmp_teams_max_nth;
7938 // Set number of teams (number of threads in the outer "parallel" of the
7939 // teams)
7940 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7942 __kmp_push_thread_limit(thr, num_teams, num_threads);
7945 /* This sets the requested number of teams for the teams region and/or
7946 the number of threads for the next parallel region encountered */
7947 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7948 int num_teams_ub, int num_threads) {
7949 kmp_info_t *thr = __kmp_threads[gtid];
7950 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7951 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7952 KMP_DEBUG_ASSERT(num_threads >= 0);
7954 if (num_teams_lb > num_teams_ub) {
7955 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7956 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7959 int num_teams = 1; // defalt number of teams is 1.
7961 if (num_teams_lb == 0 && num_teams_ub > 0)
7962 num_teams_lb = num_teams_ub;
7964 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7965 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7966 if (num_teams > __kmp_teams_max_nth) {
7967 if (!__kmp_reserve_warn) {
7968 __kmp_reserve_warn = 1;
7969 __kmp_msg(kmp_ms_warning,
7970 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7971 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7973 num_teams = __kmp_teams_max_nth;
7975 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7976 num_teams = num_teams_ub;
7977 } else { // num_teams_lb <= num_teams <= num_teams_ub
7978 if (num_threads <= 0) {
7979 if (num_teams_ub > __kmp_teams_max_nth) {
7980 num_teams = num_teams_lb;
7981 } else {
7982 num_teams = num_teams_ub;
7984 } else {
7985 num_teams = (num_threads > __kmp_teams_max_nth)
7986 ? num_teams
7987 : __kmp_teams_max_nth / num_threads;
7988 if (num_teams < num_teams_lb) {
7989 num_teams = num_teams_lb;
7990 } else if (num_teams > num_teams_ub) {
7991 num_teams = num_teams_ub;
7995 // Set number of teams (number of threads in the outer "parallel" of the
7996 // teams)
7997 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7999 __kmp_push_thread_limit(thr, num_teams, num_threads);
8002 // Set the proc_bind var to use in the following parallel region.
8003 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8004 kmp_info_t *thr = __kmp_threads[gtid];
8005 thr->th.th_set_proc_bind = proc_bind;
8008 /* Launch the worker threads into the microtask. */
8010 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8011 kmp_info_t *this_thr = __kmp_threads[gtid];
8013 #ifdef KMP_DEBUG
8014 int f;
8015 #endif /* KMP_DEBUG */
8017 KMP_DEBUG_ASSERT(team);
8018 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8019 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8020 KMP_MB(); /* Flush all pending memory write invalidates. */
8022 team->t.t_construct = 0; /* no single directives seen yet */
8023 team->t.t_ordered.dt.t_value =
8024 0; /* thread 0 enters the ordered section first */
8026 /* Reset the identifiers on the dispatch buffer */
8027 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8028 if (team->t.t_max_nproc > 1) {
8029 int i;
8030 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8031 team->t.t_disp_buffer[i].buffer_index = i;
8032 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8034 } else {
8035 team->t.t_disp_buffer[0].buffer_index = 0;
8036 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8039 KMP_MB(); /* Flush all pending memory write invalidates. */
8040 KMP_ASSERT(this_thr->th.th_team == team);
8042 #ifdef KMP_DEBUG
8043 for (f = 0; f < team->t.t_nproc; f++) {
8044 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8045 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8047 #endif /* KMP_DEBUG */
8049 /* release the worker threads so they may begin working */
8050 __kmp_fork_barrier(gtid, 0);
8053 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8054 kmp_info_t *this_thr = __kmp_threads[gtid];
8056 KMP_DEBUG_ASSERT(team);
8057 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8058 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8059 KMP_MB(); /* Flush all pending memory write invalidates. */
8061 /* Join barrier after fork */
8063 #ifdef KMP_DEBUG
8064 if (__kmp_threads[gtid] &&
8065 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8066 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8067 __kmp_threads[gtid]);
8068 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8069 "team->t.t_nproc=%d\n",
8070 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8071 team->t.t_nproc);
8072 __kmp_print_structure();
8074 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8075 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8076 #endif /* KMP_DEBUG */
8078 __kmp_join_barrier(gtid); /* wait for everyone */
8079 #if OMPT_SUPPORT
8080 if (ompt_enabled.enabled &&
8081 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8082 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8083 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8084 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8085 #if OMPT_OPTIONAL
8086 void *codeptr = NULL;
8087 if (KMP_MASTER_TID(ds_tid) &&
8088 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8089 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8090 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8092 if (ompt_enabled.ompt_callback_sync_region_wait) {
8093 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8094 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8095 codeptr);
8097 if (ompt_enabled.ompt_callback_sync_region) {
8098 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8099 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8100 codeptr);
8102 #endif
8103 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8104 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8105 ompt_scope_end, NULL, task_data, 0, ds_tid,
8106 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8109 #endif
8111 KMP_MB(); /* Flush all pending memory write invalidates. */
8112 KMP_ASSERT(this_thr->th.th_team == team);
8115 /* ------------------------------------------------------------------------ */
8117 #ifdef USE_LOAD_BALANCE
8119 // Return the worker threads actively spinning in the hot team, if we
8120 // are at the outermost level of parallelism. Otherwise, return 0.
8121 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8122 int i;
8123 int retval;
8124 kmp_team_t *hot_team;
8126 if (root->r.r_active) {
8127 return 0;
8129 hot_team = root->r.r_hot_team;
8130 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8131 return hot_team->t.t_nproc - 1; // Don't count primary thread
8134 // Skip the primary thread - it is accounted for elsewhere.
8135 retval = 0;
8136 for (i = 1; i < hot_team->t.t_nproc; i++) {
8137 if (hot_team->t.t_threads[i]->th.th_active) {
8138 retval++;
8141 return retval;
8144 // Perform an automatic adjustment to the number of
8145 // threads used by the next parallel region.
8146 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8147 int retval;
8148 int pool_active;
8149 int hot_team_active;
8150 int team_curr_active;
8151 int system_active;
8153 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8154 set_nproc));
8155 KMP_DEBUG_ASSERT(root);
8156 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8157 ->th.th_current_task->td_icvs.dynamic == TRUE);
8158 KMP_DEBUG_ASSERT(set_nproc > 1);
8160 if (set_nproc == 1) {
8161 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8162 return 1;
8165 // Threads that are active in the thread pool, active in the hot team for this
8166 // particular root (if we are at the outer par level), and the currently
8167 // executing thread (to become the primary thread) are available to add to the
8168 // new team, but are currently contributing to the system load, and must be
8169 // accounted for.
8170 pool_active = __kmp_thread_pool_active_nth;
8171 hot_team_active = __kmp_active_hot_team_nproc(root);
8172 team_curr_active = pool_active + hot_team_active + 1;
8174 // Check the system load.
8175 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8176 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8177 "hot team active = %d\n",
8178 system_active, pool_active, hot_team_active));
8180 if (system_active < 0) {
8181 // There was an error reading the necessary info from /proc, so use the
8182 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8183 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8184 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8185 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8187 // Make this call behave like the thread limit algorithm.
8188 retval = __kmp_avail_proc - __kmp_nth +
8189 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8190 if (retval > set_nproc) {
8191 retval = set_nproc;
8193 if (retval < KMP_MIN_NTH) {
8194 retval = KMP_MIN_NTH;
8197 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8198 retval));
8199 return retval;
8202 // There is a slight delay in the load balance algorithm in detecting new
8203 // running procs. The real system load at this instant should be at least as
8204 // large as the #active omp thread that are available to add to the team.
8205 if (system_active < team_curr_active) {
8206 system_active = team_curr_active;
8208 retval = __kmp_avail_proc - system_active + team_curr_active;
8209 if (retval > set_nproc) {
8210 retval = set_nproc;
8212 if (retval < KMP_MIN_NTH) {
8213 retval = KMP_MIN_NTH;
8216 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8217 return retval;
8218 } // __kmp_load_balance_nproc()
8220 #endif /* USE_LOAD_BALANCE */
8222 /* ------------------------------------------------------------------------ */
8224 /* NOTE: this is called with the __kmp_init_lock held */
8225 void __kmp_cleanup(void) {
8226 int f;
8228 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8230 if (TCR_4(__kmp_init_parallel)) {
8231 #if KMP_HANDLE_SIGNALS
8232 __kmp_remove_signals();
8233 #endif
8234 TCW_4(__kmp_init_parallel, FALSE);
8237 if (TCR_4(__kmp_init_middle)) {
8238 #if KMP_AFFINITY_SUPPORTED
8239 __kmp_affinity_uninitialize();
8240 #endif /* KMP_AFFINITY_SUPPORTED */
8241 __kmp_cleanup_hierarchy();
8242 TCW_4(__kmp_init_middle, FALSE);
8245 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8247 if (__kmp_init_serial) {
8248 __kmp_runtime_destroy();
8249 __kmp_init_serial = FALSE;
8252 __kmp_cleanup_threadprivate_caches();
8254 for (f = 0; f < __kmp_threads_capacity; f++) {
8255 if (__kmp_root[f] != NULL) {
8256 __kmp_free(__kmp_root[f]);
8257 __kmp_root[f] = NULL;
8260 __kmp_free(__kmp_threads);
8261 // __kmp_threads and __kmp_root were allocated at once, as single block, so
8262 // there is no need in freeing __kmp_root.
8263 __kmp_threads = NULL;
8264 __kmp_root = NULL;
8265 __kmp_threads_capacity = 0;
8267 // Free old __kmp_threads arrays if they exist.
8268 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8269 while (ptr) {
8270 kmp_old_threads_list_t *next = ptr->next;
8271 __kmp_free(ptr->threads);
8272 __kmp_free(ptr);
8273 ptr = next;
8276 #if KMP_USE_DYNAMIC_LOCK
8277 __kmp_cleanup_indirect_user_locks();
8278 #else
8279 __kmp_cleanup_user_locks();
8280 #endif
8281 #if OMPD_SUPPORT
8282 if (ompd_state) {
8283 __kmp_free(ompd_env_block);
8284 ompd_env_block = NULL;
8285 ompd_env_block_size = 0;
8287 #endif
8289 #if KMP_AFFINITY_SUPPORTED
8290 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8291 __kmp_cpuinfo_file = NULL;
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8294 #if KMP_USE_ADAPTIVE_LOCKS
8295 #if KMP_DEBUG_ADAPTIVE_LOCKS
8296 __kmp_print_speculative_stats();
8297 #endif
8298 #endif
8299 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8300 __kmp_nested_nth.nth = NULL;
8301 __kmp_nested_nth.size = 0;
8302 __kmp_nested_nth.used = 0;
8303 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8304 __kmp_nested_proc_bind.bind_types = NULL;
8305 __kmp_nested_proc_bind.size = 0;
8306 __kmp_nested_proc_bind.used = 0;
8307 if (__kmp_affinity_format) {
8308 KMP_INTERNAL_FREE(__kmp_affinity_format);
8309 __kmp_affinity_format = NULL;
8312 __kmp_i18n_catclose();
8314 #if KMP_USE_HIER_SCHED
8315 __kmp_hier_scheds.deallocate();
8316 #endif
8318 #if KMP_STATS_ENABLED
8319 __kmp_stats_fini();
8320 #endif
8322 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8325 /* ------------------------------------------------------------------------ */
8327 int __kmp_ignore_mppbeg(void) {
8328 char *env;
8330 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8331 if (__kmp_str_match_false(env))
8332 return FALSE;
8334 // By default __kmpc_begin() is no-op.
8335 return TRUE;
8338 int __kmp_ignore_mppend(void) {
8339 char *env;
8341 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8342 if (__kmp_str_match_false(env))
8343 return FALSE;
8345 // By default __kmpc_end() is no-op.
8346 return TRUE;
8349 void __kmp_internal_begin(void) {
8350 int gtid;
8351 kmp_root_t *root;
8353 /* this is a very important step as it will register new sibling threads
8354 and assign these new uber threads a new gtid */
8355 gtid = __kmp_entry_gtid();
8356 root = __kmp_threads[gtid]->th.th_root;
8357 KMP_ASSERT(KMP_UBER_GTID(gtid));
8359 if (root->r.r_begin)
8360 return;
8361 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8362 if (root->r.r_begin) {
8363 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8364 return;
8367 root->r.r_begin = TRUE;
8369 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8372 /* ------------------------------------------------------------------------ */
8374 void __kmp_user_set_library(enum library_type arg) {
8375 int gtid;
8376 kmp_root_t *root;
8377 kmp_info_t *thread;
8379 /* first, make sure we are initialized so we can get our gtid */
8381 gtid = __kmp_entry_gtid();
8382 thread = __kmp_threads[gtid];
8384 root = thread->th.th_root;
8386 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8387 library_serial));
8388 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8389 thread */
8390 KMP_WARNING(SetLibraryIncorrectCall);
8391 return;
8394 switch (arg) {
8395 case library_serial:
8396 thread->th.th_set_nproc = 0;
8397 set__nproc(thread, 1);
8398 break;
8399 case library_turnaround:
8400 thread->th.th_set_nproc = 0;
8401 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8402 : __kmp_dflt_team_nth_ub);
8403 break;
8404 case library_throughput:
8405 thread->th.th_set_nproc = 0;
8406 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8407 : __kmp_dflt_team_nth_ub);
8408 break;
8409 default:
8410 KMP_FATAL(UnknownLibraryType, arg);
8413 __kmp_aux_set_library(arg);
8416 void __kmp_aux_set_stacksize(size_t arg) {
8417 if (!__kmp_init_serial)
8418 __kmp_serial_initialize();
8420 #if KMP_OS_DARWIN
8421 if (arg & (0x1000 - 1)) {
8422 arg &= ~(0x1000 - 1);
8423 if (arg + 0x1000) /* check for overflow if we round up */
8424 arg += 0x1000;
8426 #endif
8427 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8429 /* only change the default stacksize before the first parallel region */
8430 if (!TCR_4(__kmp_init_parallel)) {
8431 size_t value = arg; /* argument is in bytes */
8433 if (value < __kmp_sys_min_stksize)
8434 value = __kmp_sys_min_stksize;
8435 else if (value > KMP_MAX_STKSIZE)
8436 value = KMP_MAX_STKSIZE;
8438 __kmp_stksize = value;
8440 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8443 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8446 /* set the behaviour of the runtime library */
8447 /* TODO this can cause some odd behaviour with sibling parallelism... */
8448 void __kmp_aux_set_library(enum library_type arg) {
8449 __kmp_library = arg;
8451 switch (__kmp_library) {
8452 case library_serial: {
8453 KMP_INFORM(LibraryIsSerial);
8454 } break;
8455 case library_turnaround:
8456 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8457 __kmp_use_yield = 2; // only yield when oversubscribed
8458 break;
8459 case library_throughput:
8460 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8461 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8462 break;
8463 default:
8464 KMP_FATAL(UnknownLibraryType, arg);
8468 /* Getting team information common for all team API */
8469 // Returns NULL if not in teams construct
8470 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8471 kmp_info_t *thr = __kmp_entry_thread();
8472 teams_serialized = 0;
8473 if (thr->th.th_teams_microtask) {
8474 kmp_team_t *team = thr->th.th_team;
8475 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8476 int ii = team->t.t_level;
8477 teams_serialized = team->t.t_serialized;
8478 int level = tlevel + 1;
8479 KMP_DEBUG_ASSERT(ii >= tlevel);
8480 while (ii > level) {
8481 for (teams_serialized = team->t.t_serialized;
8482 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8484 if (team->t.t_serialized && (!teams_serialized)) {
8485 team = team->t.t_parent;
8486 continue;
8488 if (ii > level) {
8489 team = team->t.t_parent;
8490 ii--;
8493 return team;
8495 return NULL;
8498 int __kmp_aux_get_team_num() {
8499 int serialized;
8500 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8501 if (team) {
8502 if (serialized > 1) {
8503 return 0; // teams region is serialized ( 1 team of 1 thread ).
8504 } else {
8505 return team->t.t_master_tid;
8508 return 0;
8511 int __kmp_aux_get_num_teams() {
8512 int serialized;
8513 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8514 if (team) {
8515 if (serialized > 1) {
8516 return 1;
8517 } else {
8518 return team->t.t_parent->t.t_nproc;
8521 return 1;
8524 /* ------------------------------------------------------------------------ */
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (must match __kmp_affinity_format_table):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
// One entry of the affinity-format keyword table: the single-character name,
// the long {name}, and the printf conversion used to print the field's value.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-letter field type from the spec, e.g. 'L'
  const char *long_name; // long field type from the spec, e.g. "nesting_level"
  char field_format; // conversion character for snprintf (typically 'd' or
                     // 's' for integer or string)
} kmp_affinity_format_field_t;
8564 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8565 #if KMP_AFFINITY_SUPPORTED
8566 {'A', "thread_affinity", 's'},
8567 #endif
8568 {'t', "team_num", 'd'},
8569 {'T', "num_teams", 'd'},
8570 {'L', "nesting_level", 'd'},
8571 {'n', "thread_num", 'd'},
8572 {'N', "num_threads", 'd'},
8573 {'a', "ancestor_tnum", 'd'},
8574 {'H', "host", 's'},
8575 {'P', "process_id", 'd'},
8576 {'i', "native_thread_id", 'd'}};
8578 // Return the number of characters it takes to hold field
8579 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8580 const char **ptr,
8581 kmp_str_buf_t *field_buffer) {
8582 int rc, format_index, field_value;
8583 const char *width_left, *width_right;
8584 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8585 static const int FORMAT_SIZE = 20;
8586 char format[FORMAT_SIZE] = {0};
8587 char absolute_short_name = 0;
8589 KMP_DEBUG_ASSERT(gtid >= 0);
8590 KMP_DEBUG_ASSERT(th);
8591 KMP_DEBUG_ASSERT(**ptr == '%');
8592 KMP_DEBUG_ASSERT(field_buffer);
8594 __kmp_str_buf_clear(field_buffer);
8596 // Skip the initial %
8597 (*ptr)++;
8599 // Check for %% first
8600 if (**ptr == '%') {
8601 __kmp_str_buf_cat(field_buffer, "%", 1);
8602 (*ptr)++; // skip over the second %
8603 return 1;
8606 // Parse field modifiers if they are present
8607 pad_zeros = false;
8608 if (**ptr == '0') {
8609 pad_zeros = true;
8610 (*ptr)++; // skip over 0
8612 right_justify = false;
8613 if (**ptr == '.') {
8614 right_justify = true;
8615 (*ptr)++; // skip over .
8617 // Parse width of field: [width_left, width_right)
8618 width_left = width_right = NULL;
8619 if (**ptr >= '0' && **ptr <= '9') {
8620 width_left = *ptr;
8621 SKIP_DIGITS(*ptr);
8622 width_right = *ptr;
8625 // Create the format for KMP_SNPRINTF based on flags parsed above
8626 format_index = 0;
8627 format[format_index++] = '%';
8628 if (!right_justify)
8629 format[format_index++] = '-';
8630 if (pad_zeros)
8631 format[format_index++] = '0';
8632 if (width_left && width_right) {
8633 int i = 0;
8634 // Only allow 8 digit number widths.
8635 // This also prevents overflowing format variable
8636 while (i < 8 && width_left < width_right) {
8637 format[format_index++] = *width_left;
8638 width_left++;
8639 i++;
8643 // Parse a name (long or short)
8644 // Canonicalize the name into absolute_short_name
8645 found_valid_name = false;
8646 parse_long_name = (**ptr == '{');
8647 if (parse_long_name)
8648 (*ptr)++; // skip initial left brace
8649 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8650 sizeof(__kmp_affinity_format_table[0]);
8651 ++i) {
8652 char short_name = __kmp_affinity_format_table[i].short_name;
8653 const char *long_name = __kmp_affinity_format_table[i].long_name;
8654 char field_format = __kmp_affinity_format_table[i].field_format;
8655 if (parse_long_name) {
8656 size_t length = KMP_STRLEN(long_name);
8657 if (strncmp(*ptr, long_name, length) == 0) {
8658 found_valid_name = true;
8659 (*ptr) += length; // skip the long name
8661 } else if (**ptr == short_name) {
8662 found_valid_name = true;
8663 (*ptr)++; // skip the short name
8665 if (found_valid_name) {
8666 format[format_index++] = field_format;
8667 format[format_index++] = '\0';
8668 absolute_short_name = short_name;
8669 break;
8672 if (parse_long_name) {
8673 if (**ptr != '}') {
8674 absolute_short_name = 0;
8675 } else {
8676 (*ptr)++; // skip over the right brace
8680 // Attempt to fill the buffer with the requested
8681 // value using snprintf within __kmp_str_buf_print()
8682 switch (absolute_short_name) {
8683 case 't':
8684 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8685 break;
8686 case 'T':
8687 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8688 break;
8689 case 'L':
8690 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8691 break;
8692 case 'n':
8693 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8694 break;
8695 case 'H': {
8696 static const int BUFFER_SIZE = 256;
8697 char buf[BUFFER_SIZE];
8698 __kmp_expand_host_name(buf, BUFFER_SIZE);
8699 rc = __kmp_str_buf_print(field_buffer, format, buf);
8700 } break;
8701 case 'P':
8702 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8703 break;
8704 case 'i':
8705 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8706 break;
8707 case 'N':
8708 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8709 break;
8710 case 'a':
8711 field_value =
8712 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8713 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8714 break;
8715 #if KMP_AFFINITY_SUPPORTED
8716 case 'A': {
8717 kmp_str_buf_t buf;
8718 __kmp_str_buf_init(&buf);
8719 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8720 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8721 __kmp_str_buf_free(&buf);
8722 } break;
8723 #endif
8724 default:
8725 // According to spec, If an implementation does not have info for field
8726 // type, then "undefined" is printed
8727 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8728 // Skip the field
8729 if (parse_long_name) {
8730 SKIP_TOKEN(*ptr);
8731 if (**ptr == '}')
8732 (*ptr)++;
8733 } else {
8734 (*ptr)++;
8738 KMP_ASSERT(format_index <= FORMAT_SIZE);
8739 return rc;
8743 * Return number of characters needed to hold the affinity string
8744 * (not including null byte character)
8745 * The resultant string is printed to buffer, which the caller can then
8746 * handle afterwards
8748 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8749 kmp_str_buf_t *buffer) {
8750 const char *parse_ptr;
8751 size_t retval;
8752 const kmp_info_t *th;
8753 kmp_str_buf_t field;
8755 KMP_DEBUG_ASSERT(buffer);
8756 KMP_DEBUG_ASSERT(gtid >= 0);
8758 __kmp_str_buf_init(&field);
8759 __kmp_str_buf_clear(buffer);
8761 th = __kmp_threads[gtid];
8762 retval = 0;
8764 // If format is NULL or zero-length string, then we use
8765 // affinity-format-var ICV
8766 parse_ptr = format;
8767 if (parse_ptr == NULL || *parse_ptr == '\0') {
8768 parse_ptr = __kmp_affinity_format;
8770 KMP_DEBUG_ASSERT(parse_ptr);
8772 while (*parse_ptr != '\0') {
8773 // Parse a field
8774 if (*parse_ptr == '%') {
8775 // Put field in the buffer
8776 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8777 __kmp_str_buf_catbuf(buffer, &field);
8778 retval += rc;
8779 } else {
8780 // Put literal character in buffer
8781 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8782 retval++;
8783 parse_ptr++;
8786 __kmp_str_buf_free(&field);
8787 return retval;
8790 // Displays the affinity string to stdout
8791 void __kmp_aux_display_affinity(int gtid, const char *format) {
8792 kmp_str_buf_t buf;
8793 __kmp_str_buf_init(&buf);
8794 __kmp_aux_capture_affinity(gtid, format, &buf);
8795 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8796 __kmp_str_buf_free(&buf);
8799 /* ------------------------------------------------------------------------ */
8800 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8801 int blocktime = arg; /* argument is in microseconds */
8802 #if KMP_USE_MONITOR
8803 int bt_intervals;
8804 #endif
8805 kmp_int8 bt_set;
8807 __kmp_save_internal_controls(thread);
8809 /* Normalize and set blocktime for the teams */
8810 if (blocktime < KMP_MIN_BLOCKTIME)
8811 blocktime = KMP_MIN_BLOCKTIME;
8812 else if (blocktime > KMP_MAX_BLOCKTIME)
8813 blocktime = KMP_MAX_BLOCKTIME;
8815 set__blocktime_team(thread->th.th_team, tid, blocktime);
8816 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8818 #if KMP_USE_MONITOR
8819 /* Calculate and set blocktime intervals for the teams */
8820 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8822 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8823 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8824 #endif
8826 /* Set whether blocktime has been set to "TRUE" */
8827 bt_set = TRUE;
8829 set__bt_set_team(thread->th.th_team, tid, bt_set);
8830 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8831 #if KMP_USE_MONITOR
8832 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8833 "bt_intervals=%d, monitor_updates=%d\n",
8834 __kmp_gtid_from_tid(tid, thread->th.th_team),
8835 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8836 __kmp_monitor_wakeups));
8837 #else
8838 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8839 __kmp_gtid_from_tid(tid, thread->th.th_team),
8840 thread->th.th_team->t.t_id, tid, blocktime));
8841 #endif
8844 void __kmp_aux_set_defaults(char const *str, size_t len) {
8845 if (!__kmp_init_serial) {
8846 __kmp_serial_initialize();
8848 __kmp_env_initialize(str);
8850 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8851 __kmp_env_print();
8853 } // __kmp_aux_set_defaults
8855 /* ------------------------------------------------------------------------ */
8856 /* internal fast reduction routines */
8858 PACKED_REDUCTION_METHOD_T
8859 __kmp_determine_reduction_method(
8860 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8861 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8862 kmp_critical_name *lck) {
8864 // Default reduction method: critical construct ( lck != NULL, like in current
8865 // PAROPT )
8866 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8867 // can be selected by RTL
8868 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8869 // can be selected by RTL
8870 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8871 // among generated by PAROPT.
8873 PACKED_REDUCTION_METHOD_T retval;
8875 int team_size;
8877 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8879 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8880 (loc && \
8881 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8882 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8884 retval = critical_reduce_block;
8886 // another choice of getting a team size (with 1 dynamic deference) is slower
8887 team_size = __kmp_get_team_num_threads(global_tid);
8888 if (team_size == 1) {
8890 retval = empty_reduce_block;
8892 } else {
8894 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8896 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8897 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
8899 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8900 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8902 int teamsize_cutoff = 4;
8904 #if KMP_MIC_SUPPORTED
8905 if (__kmp_mic_type != non_mic) {
8906 teamsize_cutoff = 8;
8908 #endif
8909 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8910 if (tree_available) {
8911 if (team_size <= teamsize_cutoff) {
8912 if (atomic_available) {
8913 retval = atomic_reduce_block;
8915 } else {
8916 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8918 } else if (atomic_available) {
8919 retval = atomic_reduce_block;
8921 #else
8922 #error "Unknown or unsupported OS"
8923 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8924 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8926 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8928 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8929 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8931 // basic tuning
8933 if (atomic_available) {
8934 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8935 retval = atomic_reduce_block;
8937 } // otherwise: use critical section
8939 #elif KMP_OS_DARWIN
8941 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8942 if (atomic_available && (num_vars <= 3)) {
8943 retval = atomic_reduce_block;
8944 } else if (tree_available) {
8945 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8946 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8947 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8949 } // otherwise: use critical section
8951 #else
8952 #error "Unknown or unsupported OS"
8953 #endif
8955 #else
8956 #error "Unknown or unsupported architecture"
8957 #endif
8960 // KMP_FORCE_REDUCTION
8962 // If the team is serialized (team_size == 1), ignore the forced reduction
8963 // method and stay with the unsynchronized method (empty_reduce_block)
8964 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8965 team_size != 1) {
8967 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8969 int atomic_available, tree_available;
8971 switch ((forced_retval = __kmp_force_reduction_method)) {
8972 case critical_reduce_block:
8973 KMP_ASSERT(lck); // lck should be != 0
8974 break;
8976 case atomic_reduce_block:
8977 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8978 if (!atomic_available) {
8979 KMP_WARNING(RedMethodNotSupported, "atomic");
8980 forced_retval = critical_reduce_block;
8982 break;
8984 case tree_reduce_block:
8985 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8986 if (!tree_available) {
8987 KMP_WARNING(RedMethodNotSupported, "tree");
8988 forced_retval = critical_reduce_block;
8989 } else {
8990 #if KMP_FAST_REDUCTION_BARRIER
8991 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8992 #endif
8994 break;
8996 default:
8997 KMP_ASSERT(0); // "unsupported method specified"
9000 retval = forced_retval;
9003 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9005 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9006 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9008 return (retval);
9010 // this function is for testing set/get/determine reduce method
9011 kmp_int32 __kmp_get_reduce_method(void) {
9012 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9015 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9016 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9017 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9019 // Hard pause shuts down the runtime completely. Resume happens naturally when
9020 // OpenMP is used subsequently.
9021 void __kmp_hard_pause() {
9022 __kmp_pause_status = kmp_hard_paused;
9023 __kmp_internal_end_thread(-1);
9026 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9027 void __kmp_resume_if_soft_paused() {
9028 if (__kmp_pause_status == kmp_soft_paused) {
9029 __kmp_pause_status = kmp_not_paused;
9031 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9032 kmp_info_t *thread = __kmp_threads[gtid];
9033 if (thread) { // Wake it if sleeping
9034 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9035 thread);
9036 if (fl.is_sleeping())
9037 fl.resume(gtid);
9038 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9039 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9040 } else { // thread holds the lock and may sleep soon
9041 do { // until either the thread sleeps, or we can get the lock
9042 if (fl.is_sleeping()) {
9043 fl.resume(gtid);
9044 break;
9045 } else if (__kmp_try_suspend_mx(thread)) {
9046 __kmp_unlock_suspend_mx(thread);
9047 break;
9049 } while (1);
9056 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9057 // TODO: add warning messages
9058 int __kmp_pause_resource(kmp_pause_status_t level) {
9059 if (level == kmp_not_paused) { // requesting resume
9060 if (__kmp_pause_status == kmp_not_paused) {
9061 // error message about runtime not being paused, so can't resume
9062 return 1;
9063 } else {
9064 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9065 __kmp_pause_status == kmp_hard_paused);
9066 __kmp_pause_status = kmp_not_paused;
9067 return 0;
9069 } else if (level == kmp_soft_paused) { // requesting soft pause
9070 if (__kmp_pause_status != kmp_not_paused) {
9071 // error message about already being paused
9072 return 1;
9073 } else {
9074 __kmp_soft_pause();
9075 return 0;
9077 } else if (level == kmp_hard_paused) { // requesting hard pause
9078 if (__kmp_pause_status != kmp_not_paused) {
9079 // error message about already being paused
9080 return 1;
9081 } else {
9082 __kmp_hard_pause();
9083 return 0;
9085 } else {
9086 // error message about invalid level
9087 return 1;
9091 void __kmp_omp_display_env(int verbose) {
9092 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9093 if (__kmp_init_serial == 0)
9094 __kmp_do_serial_initialize();
9095 __kmp_display_env_impl(!verbose, verbose);
9096 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9099 // The team size is changing, so distributed barrier must be modified
9100 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9101 int new_nthreads) {
9102 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9103 bp_dist_bar);
9104 kmp_info_t **other_threads = team->t.t_threads;
9106 // We want all the workers to stop waiting on the barrier while we adjust the
9107 // size of the team.
9108 for (int f = 1; f < old_nthreads; ++f) {
9109 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9110 // Ignore threads that are already inactive or not present in the team
9111 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9112 // teams construct causes thread_limit to get passed in, and some of
9113 // those could be inactive; just ignore them
9114 continue;
9116 // If thread is transitioning still to in_use state, wait for it
9117 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9118 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9119 KMP_CPU_PAUSE();
9121 // The thread should be in_use now
9122 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9123 // Transition to unused state
9124 team->t.t_threads[f]->th.th_used_in_team.store(2);
9125 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9127 // Release all the workers
9128 team->t.b->go_release();
9130 KMP_MFENCE();
9132 // Workers should see transition status 2 and move to 0; but may need to be
9133 // woken up first
9134 int count = old_nthreads - 1;
9135 while (count > 0) {
9136 count = old_nthreads - 1;
9137 for (int f = 1; f < old_nthreads; ++f) {
9138 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9139 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9140 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9141 void *, other_threads[f]->th.th_sleep_loc);
9142 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9144 } else {
9145 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9146 count--;
9150 // Now update the barrier size
9151 team->t.b->update_num_threads(new_nthreads);
9152 team->t.b->go_reset();
9155 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9156 // Add the threads back to the team
9157 KMP_DEBUG_ASSERT(team);
9158 // Threads were paused and pointed at th_used_in_team temporarily during a
9159 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9160 // the thread that it should transition itself back into the team. Then, if
9161 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9162 // to wake it up.
9163 for (int f = 1; f < new_nthreads; ++f) {
9164 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9165 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9167 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9168 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9169 (kmp_flag_32<false, false> *)NULL);
9172 // The threads should be transitioning to the team; when they are done, they
9173 // should have set th_used_in_team to 1. This loop forces master to wait until
9174 // all threads have moved into the team and are waiting in the barrier.
9175 int count = new_nthreads - 1;
9176 while (count > 0) {
9177 count = new_nthreads - 1;
9178 for (int f = 1; f < new_nthreads; ++f) {
9179 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9180 count--;
9186 // Globals and functions for hidden helper task
9187 kmp_info_t **__kmp_hidden_helper_threads;
9188 kmp_info_t *__kmp_hidden_helper_main_thread;
9189 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9190 #if KMP_OS_LINUX
9191 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9192 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9193 #else
9194 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9195 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9196 #endif
9198 namespace {
9199 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9201 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9202 // This is an explicit synchronization on all hidden helper threads in case
9203 // that when a regular thread pushes a hidden helper task to one hidden
9204 // helper thread, the thread has not been awaken once since they're released
9205 // by the main thread after creating the team.
9206 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9207 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9208 __kmp_hidden_helper_threads_num)
9211 // If main thread, then wait for signal
9212 if (__kmpc_master(nullptr, *gtid)) {
9213 // First, unset the initial state and release the initial thread
9214 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9215 __kmp_hidden_helper_initz_release();
9216 __kmp_hidden_helper_main_thread_wait();
9217 // Now wake up all worker threads
9218 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9219 __kmp_hidden_helper_worker_thread_signal();
9223 } // namespace
9225 void __kmp_hidden_helper_threads_initz_routine() {
9226 // Create a new root for hidden helper team/threads
9227 const int gtid = __kmp_register_root(TRUE);
9228 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9229 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9230 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9231 __kmp_hidden_helper_threads_num;
9233 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9235 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9237 // Set the initialization flag to FALSE
9238 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9240 __kmp_hidden_helper_threads_deinitz_release();
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
      one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
      in the topology, and initializes the number of threads at each of those
      levels to the number of entities at each level, respectively, below the
      entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
      but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This is an even more experimental
      option of this experimental feature, and may change or go away in the
      future.
*/
9259 // Allocate space to store nesting levels
9260 void __kmp_init_nesting_mode() {
9261 int levels = KMP_HW_LAST;
9262 __kmp_nesting_mode_nlevels = levels;
9263 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9264 for (int i = 0; i < levels; ++i)
9265 __kmp_nesting_nth_level[i] = 0;
9266 if (__kmp_nested_nth.size < levels) {
9267 __kmp_nested_nth.nth =
9268 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9269 __kmp_nested_nth.size = levels;
9273 // Set # threads for top levels of nesting; must be called after topology set
9274 void __kmp_set_nesting_mode_threads() {
9275 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9277 if (__kmp_nesting_mode == 1)
9278 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9279 else if (__kmp_nesting_mode > 1)
9280 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9282 if (__kmp_topology) { // use topology info
9283 int loc, hw_level;
9284 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9285 loc < __kmp_nesting_mode_nlevels;
9286 loc++, hw_level++) {
9287 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9288 if (__kmp_nesting_nth_level[loc] == 1)
9289 loc--;
9291 // Make sure all cores are used
9292 if (__kmp_nesting_mode > 1 && loc > 1) {
9293 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9294 int num_cores = __kmp_topology->get_count(core_level);
9295 int upper_levels = 1;
9296 for (int level = 0; level < loc - 1; ++level)
9297 upper_levels *= __kmp_nesting_nth_level[level];
9298 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9299 __kmp_nesting_nth_level[loc - 1] =
9300 num_cores / __kmp_nesting_nth_level[loc - 2];
9302 __kmp_nesting_mode_nlevels = loc;
9303 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9304 } else { // no topology info available; provide a reasonable guesstimation
9305 if (__kmp_avail_proc >= 4) {
9306 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9307 __kmp_nesting_nth_level[1] = 2;
9308 __kmp_nesting_mode_nlevels = 2;
9309 } else {
9310 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9311 __kmp_nesting_mode_nlevels = 1;
9313 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9315 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9316 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9318 set__nproc(thread, __kmp_nesting_nth_level[0]);
9319 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9320 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9321 if (get__max_active_levels(thread) > 1) {
9322 // if max levels was set, set nesting mode levels to same
9323 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9325 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9326 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9329 // Empty symbols to export (see exports_so.txt) when feature is disabled
9330 extern "C" {
9331 #if !KMP_STATS_ENABLED
9332 void __kmp_reset_stats() {}
9333 #endif
9334 #if !USE_DEBUGGER
9335 int __kmp_omp_debug_struct_info = FALSE;
9336 int __kmp_debugging = FALSE;
9337 #endif
9338 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9339 void __kmp_itt_fini_ittlib() {}
9340 void __kmp_itt_init_ittlib() {}
9341 #endif
9344 // end of file