1 /*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
23 #define MAX_MESSAGE 512
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
28 /*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc in source location information
31 * @param flags in for future use (currently ignored)
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37 // By default __kmpc_begin() is a no-op.
38 char *env;
39 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40 __kmp_str_match_true(env)) {
41 __kmp_middle_initialize();
42 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43 } else if (__kmp_ignore_mppbeg() == FALSE) {
44 // By default __kmp_ignore_mppbeg() returns TRUE.
45 __kmp_internal_begin();
46 KC_TRACE(10, ("__kmpc_begin: called\n"));
50 /*!
51 * @ingroup STARTUP_SHUTDOWN
52 * @param loc source location information
54 * Shut down the runtime library. This is also optional, and even if called it will
55 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56 * zero.
58 void __kmpc_end(ident_t *loc) {
59 // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end()
60 // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
61 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62 // returns FALSE and __kmpc_end() will unregister this root (which can cause
63 // library shutdown).
64 if (__kmp_ignore_mppend() == FALSE) {
65 KC_TRACE(10, ("__kmpc_end: called\n"));
66 KA_TRACE(30, ("__kmpc_end\n"));
68 __kmp_internal_end_thread(-1);
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71 // Normal exit process on Windows does not allow worker threads of the final
72 // parallel region to finish reporting their events, so shutting down the
73 // library here fixes the issue at least for the cases where __kmpc_end() is
74 // placed properly.
75 if (ompt_enabled.enabled)
76 __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
85 This function can be called in any context.
87 If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99 kmp_int32 gtid = __kmp_entry_gtid();
101 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 return gtid;
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly, the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), so this call counts all such threads even if
118 they are not waiting for work.
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121 KC_TRACE(10,
122 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 return TCR_4(__kmp_all_nth);
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135 return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 * @ingroup DEPRECATED
151 * @param loc location description
153 * This function need not be called. It always returns TRUE.
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
158 return TRUE;
160 #else
162 const char *semi2;
163 const char *semi3;
164 int line_no;
166 if (__kmp_par_range == 0) {
167 return TRUE;
169 semi2 = loc->psource;
170 if (semi2 == NULL) {
171 return TRUE;
173 semi2 = strchr(semi2, ';');
174 if (semi2 == NULL) {
175 return TRUE;
177 semi2 = strchr(semi2 + 1, ';');
178 if (semi2 == NULL) {
179 return TRUE;
181 if (__kmp_par_range_filename[0]) {
182 const char *name = semi2 - 1;
183 while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184 name--;
186 if ((*name == '/') || (*name == ';')) {
187 name++;
189 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190 return __kmp_par_range < 0;
193 semi3 = strchr(semi2 + 1, ';');
194 if (__kmp_par_range_routine[0]) {
195 if ((semi3 != NULL) && (semi3 > semi2) &&
196 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197 return __kmp_par_range < 0;
200 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202 return __kmp_par_range > 0;
204 return __kmp_par_range < 0;
206 return TRUE;
208 #endif /* KMP_DEBUG */
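/*
The parsing above implies that loc->psource is a ';'-separated string of the
form "<tag>;filename;routine;line;...": the code skips two ';' to reach the end
of the filename field, walks backwards for the base name, takes the next field
as the routine, and reads the line number after the following ';'. The exact
field set is defined where ident_t objects are emitted, not here, so the string
below is only an illustrative guess consistent with this parser:
@code
  // hypothetical psource layout matching the parsing in __kmpc_ok_to_fork()
  const char *psource_example = "unknown;foo.c;bar;42;7;;";
@endcode
*/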
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218 return __kmp_entry_thread()->th.th_root->r.r_active;
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231 kmp_int32 num_threads) {
232 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233 global_tid, num_threads));
234 __kmp_assert_valid_gtid(global_tid);
235 __kmp_push_num_threads(loc, global_tid, num_threads);
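/*
A small illustrative sketch of the intended pairing: a num_threads(n) clause is
lowered to a __kmpc_push_num_threads() call immediately before the
__kmpc_fork_call() for the same parallel construct, so the request applies only
to the next fork by this thread. Here loc_example (a static ident_t) and
outlined_body (the outlined region body) are hypothetical placeholders; only
the entry points come from this file.
@code
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  __kmpc_push_num_threads(&loc_example, gtid, 4);                // num_threads(4)
  __kmpc_fork_call(&loc_example, 0, (kmpc_micro)outlined_body);  // fork for this construct
@endcode
*/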
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240 /* the num_threads are automatically popped */
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244 kmp_int32 proc_bind) {
245 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246 proc_bind));
247 __kmp_assert_valid_gtid(global_tid);
248 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
252 @ingroup PARALLEL
253 @param loc source location information
254 @param argc total number of arguments in the ellipsis
255 @param microtask pointer to callback routine consisting of outlined parallel
256 construct
257 @param ... pointers to shared variables that aren't global
259 Do the actual fork and call the microtask in the relevant number of threads.
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262 int gtid = __kmp_entry_gtid();
264 #if (KMP_STATS_ENABLED)
265 // If we were in a serial region, then stop the serial timer, record
266 // the event, and start parallel region timer
267 stats_state_e previous_state = KMP_GET_THREAD_STATE();
268 if (previous_state == stats_state_e::SERIAL_REGION) {
269 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270 } else {
271 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273 int inParallel = __kmpc_in_parallel(loc);
274 if (inParallel) {
275 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276 } else {
277 KMP_COUNT_BLOCK(OMP_PARALLEL);
279 #endif
281 // maybe saving thr_state is enough here
283 va_list ap;
284 va_start(ap, microtask);
286 #if OMPT_SUPPORT
287 ompt_frame_t *ompt_frame;
288 if (ompt_enabled.enabled) {
289 kmp_info_t *master_th = __kmp_threads[gtid];
290 kmp_team_t *parent_team = master_th->th.th_team;
291 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292 if (lwt)
293 ompt_frame = &(lwt->ompt_task_info.frame);
294 else {
295 int tid = __kmp_tid_from_gtid(gtid);
296 ompt_frame = &(
297 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300 OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
304 #if INCLUDE_SSC_MARKS
305 SSC_MARK_FORKING();
306 #endif
307 __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309 VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310 kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312 SSC_MARK_JOINING();
313 #endif
314 __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
317 fork_context_intel
318 #endif
321 va_end(ap);
324 #if KMP_STATS_ENABLED
325 if (previous_state == stats_state_e::SERIAL_REGION) {
326 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327 } else {
328 KMP_POP_PARTITIONED_TIMER();
330 #endif // KMP_STATS_ENABLED
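/*
For illustration, a rough sketch of the code a compiler might emit for a plain
`#pragma omp parallel` region with two shared variables. Everything below is a
hypothetical sketch: loc_example, parallel_body and lowered_parallel are
placeholders, and only __kmpc_fork_call() and the kmpc_micro outlined-function
shape (gtid/bound-tid pointers followed by the shared arguments) are taken from
this file and kmp.h.
@code
static ident_t loc_example; // source-location record, normally emitted by the compiler

// Outlined body: the first two parameters are the global and bound thread ids,
// the rest are the addresses of the shared variables passed in the ellipsis.
static void parallel_body(kmp_int32 *gtid, kmp_int32 *btid, int *a, double *b) {
  // ... region body using *a and *b ...
}

void lowered_parallel(int *a, double *b) {
  // argc == 2 shared arguments are forwarded, via the va_list packed above,
  // to every invocation of the microtask in the new team.
  __kmpc_fork_call(&loc_example, 2, (kmpc_micro)parallel_body, a, b);
}
@endcode
*/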
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345 kmp_int32 num_teams, kmp_int32 num_threads) {
346 KA_TRACE(20,
347 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348 global_tid, num_teams, num_threads));
349 __kmp_assert_valid_gtid(global_tid);
350 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
354 @ingroup PARALLEL
355 @param loc source location information
356 @param argc total number of arguments in the ellipsis
357 @param microtask pointer to callback routine consisting of outlined teams
358 construct
359 @param ... pointers to shared variables that aren't global
361 Do the actual fork and call the microtask in the relevant number of threads.
363 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
364 ...) {
365 int gtid = __kmp_entry_gtid();
366 kmp_info_t *this_thr = __kmp_threads[gtid];
367 va_list ap;
368 va_start(ap, microtask);
370 #if KMP_STATS_ENABLED
371 KMP_COUNT_BLOCK(OMP_TEAMS);
372 stats_state_e previous_state = KMP_GET_THREAD_STATE();
373 if (previous_state == stats_state_e::SERIAL_REGION) {
374 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
375 } else {
376 KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
378 #endif
380 // remember teams entry point and nesting level
381 this_thr->th.th_teams_microtask = microtask;
382 this_thr->th.th_teams_level =
383 this_thr->th.th_team->t.t_level; // AC: can be >0 on host
385 #if OMPT_SUPPORT
386 kmp_team_t *parent_team = this_thr->th.th_team;
387 int tid = __kmp_tid_from_gtid(gtid);
388 if (ompt_enabled.enabled) {
389 parent_team->t.t_implicit_task_taskdata[tid]
390 .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
392 OMPT_STORE_RETURN_ADDRESS(gtid);
393 #endif
395 // check if __kmpc_push_num_teams was called; set the default number of teams
396 // otherwise
397 if (this_thr->th.th_teams_size.nteams == 0) {
398 __kmp_push_num_teams(loc, gtid, 0, 0);
400 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
401 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
402 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
404 __kmp_fork_call(
405 loc, gtid, fork_context_intel, argc,
406 VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
407 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
408 __kmp_join_call(loc, gtid
409 #if OMPT_SUPPORT
411 fork_context_intel
412 #endif
415 // Pop current CG root off list
416 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
417 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
418 this_thr->th.th_cg_roots = tmp->up;
419 KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
420 " to node %p. cg_nthreads was %d\n",
421 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
422 KMP_DEBUG_ASSERT(tmp->cg_nthreads);
423 int i = tmp->cg_nthreads--;
424 if (i == 1) { // check if we are the last thread in CG (not always the case)
425 __kmp_free(tmp);
427 // Restore current task's thread_limit from CG root
428 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
429 this_thr->th.th_current_task->td_icvs.thread_limit =
430 this_thr->th.th_cg_roots->cg_thread_limit;
432 this_thr->th.th_teams_microtask = NULL;
433 this_thr->th.th_teams_level = 0;
434 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
435 va_end(ap);
436 #if KMP_STATS_ENABLED
437 if (previous_state == stats_state_e::SERIAL_REGION) {
438 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
439 } else {
440 KMP_POP_PARTITIONED_TIMER();
442 #endif // KMP_STATS_ENABLED
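/*
Illustrative sketch (not generated output) of how a teams construct with
num_teams/thread_limit clauses maps onto the entry points above: the clause
values are pushed first, then __kmpc_fork_teams() forks the league. The names
teams_body, lowered_teams and loc_example are placeholders as in the earlier
sketches.
@code
static void teams_body(kmp_int32 *gtid, kmp_int32 *btid, int *data) {
  // ... teams region body ...
}

void lowered_teams(int *data) {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  __kmpc_push_num_teams(&loc_example, gtid, 8, 64); // num_teams(8) thread_limit(64)
  __kmpc_fork_teams(&loc_example, 1, (kmpc_micro)teams_body, data);
}
@endcode
*/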
445 // I don't think this function should ever have been exported.
446 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
447 // openmp code ever called it, but it's been exported from the RTL for so
448 // long that I'm afraid to remove the definition.
449 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
452 @ingroup PARALLEL
453 @param loc source location information
454 @param global_tid global thread number
456 Enter a serialized parallel construct. This interface is used to handle a
457 conditional parallel region, like this,
458 @code
459 #pragma omp parallel if (condition)
460 @endcode
461 when the condition is false.
463 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
464 // The implementation is now in kmp_runtime.cpp so that it can share static
465 // functions with kmp_fork_call since the tasks to be done are similar in
466 // each case.
467 __kmp_assert_valid_gtid(global_tid);
468 #if OMPT_SUPPORT
469 OMPT_STORE_RETURN_ADDRESS(global_tid);
470 #endif
471 __kmp_serialized_parallel(loc, global_tid);
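/*
The doc comment above shows the source form; a rough, illustrative sketch of
the corresponding lowering is an if/else on the condition: the true branch
forks normally, the false branch brackets a direct call to the outlined body
with the serialized-parallel entry points. body_outlined, loc_example and
lowered_parallel_if are hypothetical placeholders; the direct call with a zero
bound tid is an assumption of this sketch, not something defined in this file.
@code
static void body_outlined(kmp_int32 *gtid, kmp_int32 *btid, int *a) {
  // ... region body using *a ...
}

void lowered_parallel_if(int cond, int *a) {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  if (cond) {
    __kmpc_fork_call(&loc_example, 1, (kmpc_micro)body_outlined, a);
  } else {
    __kmpc_serialized_parallel(&loc_example, gtid);
    kmp_int32 bound_tid = 0;
    body_outlined(&gtid, &bound_tid, a); // body runs on the encountering thread
    __kmpc_end_serialized_parallel(&loc_example, gtid);
  }
}
@endcode
*/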
475 @ingroup PARALLEL
476 @param loc source location information
477 @param global_tid global thread number
479 Leave a serialized parallel construct.
481 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482 kmp_internal_control_t *top;
483 kmp_info_t *this_thr;
484 kmp_team_t *serial_team;
486 KC_TRACE(10,
487 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
489 /* skip all this code for autopar serialized loops since it results in
490 unacceptable overhead */
491 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
492 return;
494 // Not autopar code
495 __kmp_assert_valid_gtid(global_tid);
496 if (!TCR_4(__kmp_init_parallel))
497 __kmp_parallel_initialize();
499 __kmp_resume_if_soft_paused();
501 this_thr = __kmp_threads[global_tid];
502 serial_team = this_thr->th.th_serial_team;
504 kmp_task_team_t *task_team = this_thr->th.th_task_team;
505 // we need to wait for the proxy tasks before finishing the thread
506 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
507 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
509 KMP_MB();
510 KMP_DEBUG_ASSERT(serial_team);
511 KMP_ASSERT(serial_team->t.t_serialized);
512 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
513 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
514 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
515 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
517 #if OMPT_SUPPORT
518 if (ompt_enabled.enabled &&
519 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
520 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
521 if (ompt_enabled.ompt_callback_implicit_task) {
522 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
523 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
524 OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
527 // clear the task id only after unlinking the task
528 ompt_data_t *parent_task_data;
529 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
531 if (ompt_enabled.ompt_callback_parallel_end) {
532 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
533 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
534 ompt_parallel_invoker_program | ompt_parallel_team,
535 OMPT_LOAD_RETURN_ADDRESS(global_tid));
537 __ompt_lw_taskteam_unlink(this_thr);
538 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
540 #endif
542 /* If necessary, pop the internal control stack values and replace the team
543 * values */
544 top = serial_team->t.t_control_stack_top;
545 if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
546 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
547 serial_team->t.t_control_stack_top = top->next;
548 __kmp_free(top);
551 // if( serial_team -> t.t_serialized > 1 )
552 serial_team->t.t_level--;
554 /* pop dispatch buffers stack */
555 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
557 dispatch_private_info_t *disp_buffer =
558 serial_team->t.t_dispatch->th_disp_buffer;
559 serial_team->t.t_dispatch->th_disp_buffer =
560 serial_team->t.t_dispatch->th_disp_buffer->next;
561 __kmp_free(disp_buffer);
563 this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
565 --serial_team->t.t_serialized;
566 if (serial_team->t.t_serialized == 0) {
568 /* return to the parallel section */
570 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
571 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
572 __kmp_clear_x87_fpu_status_word();
573 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
574 __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
576 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
578 this_thr->th.th_team = serial_team->t.t_parent;
579 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
581 /* restore values cached in the thread */
582 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
583 this_thr->th.th_team_master =
584 serial_team->t.t_parent->t.t_threads[0]; /* JPH */
585 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
587 /* TODO the below shouldn't need to be adjusted for serialized teams */
588 this_thr->th.th_dispatch =
589 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
591 __kmp_pop_current_task_from_thread(this_thr);
593 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
594 this_thr->th.th_current_task->td_flags.executing = 1;
596 if (__kmp_tasking_mode != tskm_immediate_exec) {
597 // Copy the task team from the new child / old parent team to the thread.
598 this_thr->th.th_task_team =
599 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
600 KA_TRACE(20,
601 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
602 "team %p\n",
603 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
605 } else {
606 if (__kmp_tasking_mode != tskm_immediate_exec) {
607 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
608 "depth of serial team %p to %d\n",
609 global_tid, serial_team, serial_team->t.t_serialized));
613 if (__kmp_env_consistency_check)
614 __kmp_pop_parallel(global_tid, NULL);
615 #if OMPT_SUPPORT
616 if (ompt_enabled.enabled)
617 this_thr->th.ompt_thread_info.state =
618 ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
619 : ompt_state_work_parallel);
620 #endif
624 @ingroup SYNCHRONIZATION
625 @param loc source location information.
627 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
628 depending on the memory ordering convention obeyed by the compiler
629 even that may not be necessary).
631 void __kmpc_flush(ident_t *loc) {
632 KC_TRACE(10, ("__kmpc_flush: called\n"));
634 /* need an explicit __mf() here since we use volatile instead in the library */
635 KMP_MB(); /* Flush all pending memory write invalidates. */
637 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
638 #if KMP_MIC
639 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
640 // We shouldn't need it, though, since the ABI rules require that
641 // * If the compiler generates NGO stores it also generates the fence
642 // * If users hand-code NGO stores they should insert the fence
643 // therefore no incomplete unordered stores should be visible.
644 #else
645 // C74404
646 // This is to address non-temporal store instructions (sfence needed).
647 // The clflush instruction is also addressed (mfence needed).
648 // Probably the non-temporal load movntdqa instruction should also be
649 // addressed.
650 // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
651 if (!__kmp_cpuinfo.initialized) {
652 __kmp_query_cpuid(&__kmp_cpuinfo);
654 if (!__kmp_cpuinfo.sse2) {
655 // CPU cannot execute SSE2 instructions.
656 } else {
657 #if KMP_COMPILER_ICC
658 _mm_mfence();
659 #elif KMP_COMPILER_MSVC
660 MemoryBarrier();
661 #else
662 __sync_synchronize();
663 #endif // KMP_COMPILER_ICC
665 #endif // KMP_MIC
666 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
667 KMP_ARCH_RISCV64)
668 // Nothing to see here move along
669 #elif KMP_ARCH_PPC64
670 // Nothing needed here (we have a real MB above).
671 #else
672 #error Unknown or unsupported architecture
673 #endif
675 #if OMPT_SUPPORT && OMPT_OPTIONAL
676 if (ompt_enabled.ompt_callback_flush) {
677 ompt_callbacks.ompt_callback(ompt_callback_flush)(
678 __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
680 #endif
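/*
Illustrative lowering sketch: a `flush` directive becomes a single call to the
entry point above (loc_example stands in for the compiler-emitted ident_t):
@code
  // #pragma omp flush
  __kmpc_flush(&loc_example);
@endcode
*/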
683 /* -------------------------------------------------------------------------- */
685 @ingroup SYNCHRONIZATION
686 @param loc source location information
687 @param global_tid thread id.
689 Execute a barrier.
691 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
692 KMP_COUNT_BLOCK(OMP_BARRIER);
693 KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
694 __kmp_assert_valid_gtid(global_tid);
696 if (!TCR_4(__kmp_init_parallel))
697 __kmp_parallel_initialize();
699 __kmp_resume_if_soft_paused();
701 if (__kmp_env_consistency_check) {
702 if (loc == 0) {
703 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
705 __kmp_check_barrier(global_tid, ct_barrier, loc);
708 #if OMPT_SUPPORT
709 ompt_frame_t *ompt_frame;
710 if (ompt_enabled.enabled) {
711 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
712 if (ompt_frame->enter_frame.ptr == NULL)
713 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
714 OMPT_STORE_RETURN_ADDRESS(global_tid);
716 #endif
717 __kmp_threads[global_tid]->th.th_ident = loc;
718 // TODO: explicit barrier_wait_id:
719 // this function is called when 'barrier' directive is present or
720 // implicit barrier at the end of a worksharing construct.
721 // 1) better to add a per-thread barrier counter to a thread data structure
722 // 2) set to 0 when a new team is created
723 // 4) no sync is required
725 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
726 #if OMPT_SUPPORT && OMPT_OPTIONAL
727 if (ompt_enabled.enabled) {
728 ompt_frame->enter_frame = ompt_data_none;
730 #endif
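/*
Illustrative lowering sketch for an explicit barrier; loc_example is the
compiler-emitted ident_t for the directive, and obtaining gtid via
__kmpc_global_thread_num() is an assumption of this sketch (the compiler may
already have it available):
@code
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  __kmpc_barrier(&loc_example, gtid); // #pragma omp barrier
@endcode
*/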
733 /* The BARRIER for a MASTER section is always explicit */
735 @ingroup WORK_SHARING
736 @param loc source location information.
737 @param global_tid global thread number.
738 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
740 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
741 int status = 0;
743 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
744 __kmp_assert_valid_gtid(global_tid);
746 if (!TCR_4(__kmp_init_parallel))
747 __kmp_parallel_initialize();
749 __kmp_resume_if_soft_paused();
751 if (KMP_MASTER_GTID(global_tid)) {
752 KMP_COUNT_BLOCK(OMP_MASTER);
753 KMP_PUSH_PARTITIONED_TIMER(OMP_master);
754 status = 1;
757 #if OMPT_SUPPORT && OMPT_OPTIONAL
758 if (status) {
759 if (ompt_enabled.ompt_callback_master) {
760 kmp_info_t *this_thr = __kmp_threads[global_tid];
761 kmp_team_t *team = this_thr->th.th_team;
763 int tid = __kmp_tid_from_gtid(global_tid);
764 ompt_callbacks.ompt_callback(ompt_callback_master)(
765 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
766 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
767 OMPT_GET_RETURN_ADDRESS(0));
770 #endif
772 if (__kmp_env_consistency_check) {
773 #if KMP_USE_DYNAMIC_LOCK
774 if (status)
775 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
776 else
777 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
778 #else
779 if (status)
780 __kmp_push_sync(global_tid, ct_master, loc, NULL);
781 else
782 __kmp_check_sync(global_tid, ct_master, loc, NULL);
783 #endif
786 return status;
790 @ingroup WORK_SHARING
791 @param loc source location information.
792 @param global_tid global thread number.
794 Mark the end of a <tt>master</tt> region. This should only be called by the
795 thread that executes the <tt>master</tt> region.
797 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
798 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
799 __kmp_assert_valid_gtid(global_tid);
800 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
801 KMP_POP_PARTITIONED_TIMER();
803 #if OMPT_SUPPORT && OMPT_OPTIONAL
804 kmp_info_t *this_thr = __kmp_threads[global_tid];
805 kmp_team_t *team = this_thr->th.th_team;
806 if (ompt_enabled.ompt_callback_master) {
807 int tid = __kmp_tid_from_gtid(global_tid);
808 ompt_callbacks.ompt_callback(ompt_callback_master)(
809 ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
810 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
811 OMPT_GET_RETURN_ADDRESS(0));
813 #endif
815 if (__kmp_env_consistency_check) {
816 if (KMP_MASTER_GTID(global_tid))
817 __kmp_pop_sync(global_tid, ct_master, loc);
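/*
Illustrative sketch of how the two entry points above bracket a master region;
the guard-and-end shape follows directly from the documented return value of
__kmpc_master(), while loc_example and gtid are placeholders:
@code
  if (__kmpc_master(&loc_example, gtid)) {
    // ... master-only code ...
    __kmpc_end_master(&loc_example, gtid);
  }
  // no implied barrier: the other threads simply skip the block
@endcode
*/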
822 @ingroup WORK_SHARING
823 @param loc source location information.
824 @param gtid global thread number.
826 Start execution of an <tt>ordered</tt> construct.
828 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
829 int cid = 0;
830 kmp_info_t *th;
831 KMP_DEBUG_ASSERT(__kmp_init_serial);
833 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
834 __kmp_assert_valid_gtid(gtid);
836 if (!TCR_4(__kmp_init_parallel))
837 __kmp_parallel_initialize();
839 __kmp_resume_if_soft_paused();
841 #if USE_ITT_BUILD
842 __kmp_itt_ordered_prep(gtid);
843 // TODO: ordered_wait_id
844 #endif /* USE_ITT_BUILD */
846 th = __kmp_threads[gtid];
848 #if OMPT_SUPPORT && OMPT_OPTIONAL
849 kmp_team_t *team;
850 ompt_wait_id_t lck;
851 void *codeptr_ra;
852 if (ompt_enabled.enabled) {
853 OMPT_STORE_RETURN_ADDRESS(gtid);
854 team = __kmp_team_from_gtid(gtid);
855 lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
856 /* OMPT state update */
857 th->th.ompt_thread_info.wait_id = lck;
858 th->th.ompt_thread_info.state = ompt_state_wait_ordered;
860 /* OMPT event callback */
861 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
862 if (ompt_enabled.ompt_callback_mutex_acquire) {
863 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
864 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
865 codeptr_ra);
868 #endif
870 if (th->th.th_dispatch->th_deo_fcn != 0)
871 (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
872 else
873 __kmp_parallel_deo(&gtid, &cid, loc);
875 #if OMPT_SUPPORT && OMPT_OPTIONAL
876 if (ompt_enabled.enabled) {
877 /* OMPT state update */
878 th->th.ompt_thread_info.state = ompt_state_work_parallel;
879 th->th.ompt_thread_info.wait_id = 0;
881 /* OMPT event callback */
882 if (ompt_enabled.ompt_callback_mutex_acquired) {
883 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
884 ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
887 #endif
889 #if USE_ITT_BUILD
890 __kmp_itt_ordered_start(gtid);
891 #endif /* USE_ITT_BUILD */
895 @ingroup WORK_SHARING
896 @param loc source location information.
897 @param gtid global thread number.
899 End execution of an <tt>ordered</tt> construct.
901 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
902 int cid = 0;
903 kmp_info_t *th;
905 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
906 __kmp_assert_valid_gtid(gtid);
908 #if USE_ITT_BUILD
909 __kmp_itt_ordered_end(gtid);
910 // TODO: ordered_wait_id
911 #endif /* USE_ITT_BUILD */
913 th = __kmp_threads[gtid];
915 if (th->th.th_dispatch->th_dxo_fcn != 0)
916 (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
917 else
918 __kmp_parallel_dxo(&gtid, &cid, loc);
920 #if OMPT_SUPPORT && OMPT_OPTIONAL
921 OMPT_STORE_RETURN_ADDRESS(gtid);
922 if (ompt_enabled.ompt_callback_mutex_released) {
923 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
924 ompt_mutex_ordered,
925 (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
926 ->t.t_ordered.dt.t_value,
927 OMPT_LOAD_RETURN_ADDRESS(gtid));
929 #endif
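/*
Illustrative sketch: inside a worksharing loop with an ordered clause, each
iteration's ordered block is bracketed by the two entry points above. The
surrounding loop lowering is omitted, and loc_example/gtid are placeholders:
@code
  // ... inside the chunk of iterations assigned to this thread ...
  __kmpc_ordered(&loc_example, gtid);
  // ... ordered block: executed in iteration order across threads ...
  __kmpc_end_ordered(&loc_example, gtid);
@endcode
*/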
932 #if KMP_USE_DYNAMIC_LOCK
934 static __forceinline void
935 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
936 kmp_int32 gtid, kmp_indirect_locktag_t tag) {
937 // Pointer to the allocated indirect lock is written to crit, while indexing
938 // is ignored.
939 void *idx;
940 kmp_indirect_lock_t **lck;
941 lck = (kmp_indirect_lock_t **)crit;
942 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
943 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
944 KMP_SET_I_LOCK_LOCATION(ilk, loc);
945 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
946 KA_TRACE(20,
947 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
948 #if USE_ITT_BUILD
949 __kmp_itt_critical_creating(ilk->lock, loc);
950 #endif
951 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
952 if (status == 0) {
953 #if USE_ITT_BUILD
954 __kmp_itt_critical_destroyed(ilk->lock);
955 #endif
956 // We don't really need to destroy the unclaimed lock here since it will be
957 // cleaned up at program exit.
958 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
960 KMP_DEBUG_ASSERT(*lck != NULL);
963 // Fast-path acquire tas lock
964 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
966 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
967 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
968 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
969 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
970 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
971 kmp_uint32 spins; \
972 KMP_FSYNC_PREPARE(l); \
973 KMP_INIT_YIELD(spins); \
974 kmp_backoff_t backoff = __kmp_spin_backoff_params; \
975 do { \
976 if (TCR_4(__kmp_nth) > \
977 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
978 KMP_YIELD(TRUE); \
979 } else { \
980 KMP_YIELD_SPIN(spins); \
982 __kmp_spin_backoff(&backoff); \
983 } while ( \
984 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
985 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
987 KMP_FSYNC_ACQUIRED(l); \
990 // Fast-path test tas lock
991 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
993 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
994 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
995 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
996 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
997 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
1000 // Fast-path release tas lock
1001 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
1002 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
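/*
For readers unfamiliar with the encoding above: the TAS poll word holds
KMP_LOCK_FREE(tas) when the lock is free and gtid+1 (tagged by KMP_LOCK_BUSY)
when it is held. A minimal stand-alone sketch of the same acquire/release
protocol in C11 atomics, using 0 for "free" and omitting the backoff/yield
policy; this is independent of the kmp_tas_lock_t layout and is not part of
the runtime:
@code
  // #include <stdatomic.h>
  atomic_int poll;                  // 0 == free, gtid + 1 == busy
  void tas_acquire(int gtid) {
    int expected = 0;
    while (atomic_load_explicit(&poll, memory_order_relaxed) != 0 ||
           !atomic_compare_exchange_strong_explicit(&poll, &expected, gtid + 1,
                                                    memory_order_acquire,
                                                    memory_order_relaxed)) {
      expected = 0; // back off / yield here, as KMP_ACQUIRE_TAS_LOCK does
    }
  }
  void tas_release(void) {
    atomic_store_explicit(&poll, 0, memory_order_release);
  }
@endcode
*/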
1004 #if KMP_USE_FUTEX
1006 #include <sys/syscall.h>
1007 #include <unistd.h>
1008 #ifndef FUTEX_WAIT
1009 #define FUTEX_WAIT 0
1010 #endif
1011 #ifndef FUTEX_WAKE
1012 #define FUTEX_WAKE 1
1013 #endif
1015 // Fast-path acquire futex lock
1016 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
1018 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1019 kmp_int32 gtid_code = (gtid + 1) << 1; \
1020 KMP_MB(); \
1021 KMP_FSYNC_PREPARE(ftx); \
1022 kmp_int32 poll_val; \
1023 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
1024 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1025 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
1026 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
1027 if (!cond) { \
1028 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
1029 poll_val | \
1030 KMP_LOCK_BUSY(1, futex))) { \
1031 continue; \
1033 poll_val |= KMP_LOCK_BUSY(1, futex); \
1035 kmp_int32 rc; \
1036 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
1037 NULL, NULL, 0)) != 0) { \
1038 continue; \
1040 gtid_code |= 1; \
1042 KMP_FSYNC_ACQUIRED(ftx); \
1045 // Fast-path test futex lock
1046 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
1048 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1049 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1050 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
1051 KMP_FSYNC_ACQUIRED(ftx); \
1052 rc = TRUE; \
1053 } else { \
1054 rc = FALSE; \
1058 // Fast-path release futex lock
1059 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
1061 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1062 KMP_MB(); \
1063 KMP_FSYNC_RELEASING(ftx); \
1064 kmp_int32 poll_val = \
1065 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1066 if (KMP_LOCK_STRIP(poll_val) & 1) { \
1067 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
1068 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1070 KMP_MB(); \
1071 KMP_YIELD_OVERSUB(); \
1074 #endif // KMP_USE_FUTEX
1076 #else // KMP_USE_DYNAMIC_LOCK
1078 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1079 ident_t const *loc,
1080 kmp_int32 gtid) {
1081 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1083 // Because of the double-check, the following load doesn't need to be volatile
1084 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1086 if (lck == NULL) {
1087 void *idx;
1089 // Allocate & initialize the lock.
1090 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1091 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1092 __kmp_init_user_lock_with_checks(lck);
1093 __kmp_set_user_lock_location(lck, loc);
1094 #if USE_ITT_BUILD
1095 __kmp_itt_critical_creating(lck);
1096 // __kmp_itt_critical_creating() should be called *before* the first usage
1097 // of the underlying lock. It is the only place where we can guarantee it. There
1098 // is a chance the lock will be destroyed with no usage, but that is not a
1099 // problem, because this is not a real event seen by the user but rather a way
1100 // of setting a name for the object (lock). See more details in kmp_itt.h.
1101 #endif /* USE_ITT_BUILD */
1103 // Use a cmpxchg instruction to slam the start of the critical section with
1104 // the lock pointer. If another thread beat us to it, deallocate the lock,
1105 // and use the lock that the other thread allocated.
1106 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1108 if (status == 0) {
1109 // Deallocate the lock and reload the value.
1110 #if USE_ITT_BUILD
1111 __kmp_itt_critical_destroyed(lck);
1112 // Let ITT know the lock is destroyed and the same memory location may be reused
1113 // for another purpose.
1114 #endif /* USE_ITT_BUILD */
1115 __kmp_destroy_user_lock_with_checks(lck);
1116 __kmp_user_lock_free(&idx, gtid, lck);
1117 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1118 KMP_DEBUG_ASSERT(lck != NULL);
1121 return lck;
1124 #endif // KMP_USE_DYNAMIC_LOCK
1127 @ingroup WORK_SHARING
1128 @param loc source location information.
1129 @param global_tid global thread number.
1130 @param crit identity of the critical section. This could be a pointer to a lock
1131 associated with the critical section, or some other suitably unique value.
1133 Enter code protected by a `critical` construct.
1134 This function blocks until the executing thread can enter the critical section.
1136 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1137 kmp_critical_name *crit) {
1138 #if KMP_USE_DYNAMIC_LOCK
1139 #if OMPT_SUPPORT && OMPT_OPTIONAL
1140 OMPT_STORE_RETURN_ADDRESS(global_tid);
1141 #endif // OMPT_SUPPORT
1142 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1143 #else
1144 KMP_COUNT_BLOCK(OMP_CRITICAL);
1145 #if OMPT_SUPPORT && OMPT_OPTIONAL
1146 ompt_state_t prev_state = ompt_state_undefined;
1147 ompt_thread_info_t ti;
1148 #endif
1149 kmp_user_lock_p lck;
1151 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1152 __kmp_assert_valid_gtid(global_tid);
1154 // TODO: add THR_OVHD_STATE
1156 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1157 KMP_CHECK_USER_LOCK_INIT();
1159 if ((__kmp_user_lock_kind == lk_tas) &&
1160 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1161 lck = (kmp_user_lock_p)crit;
1163 #if KMP_USE_FUTEX
1164 else if ((__kmp_user_lock_kind == lk_futex) &&
1165 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1166 lck = (kmp_user_lock_p)crit;
1168 #endif
1169 else { // ticket, queuing or drdpa
1170 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1173 if (__kmp_env_consistency_check)
1174 __kmp_push_sync(global_tid, ct_critical, loc, lck);
1176 // Since the critical directive binds to all threads, not just the current
1177 // team, we have to check this even if we are in a serialized team.
1178 // Also, even if we are the uber thread, we still have to acquire the lock,
1179 // as we have to contend with sibling threads.
1181 #if USE_ITT_BUILD
1182 __kmp_itt_critical_acquiring(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185 OMPT_STORE_RETURN_ADDRESS(gtid);
1186 void *codeptr_ra = NULL;
1187 if (ompt_enabled.enabled) {
1188 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1189 /* OMPT state update */
1190 prev_state = ti.state;
1191 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1192 ti.state = ompt_state_wait_critical;
1194 /* OMPT event callback */
1195 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1196 if (ompt_enabled.ompt_callback_mutex_acquire) {
1197 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1198 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1199 (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1202 #endif
1203 // Value of 'crit' should be good for using as a critical_id of the critical
1204 // section directive.
1205 __kmp_acquire_user_lock_with_checks(lck, global_tid);
1207 #if USE_ITT_BUILD
1208 __kmp_itt_critical_acquired(lck);
1209 #endif /* USE_ITT_BUILD */
1210 #if OMPT_SUPPORT && OMPT_OPTIONAL
1211 if (ompt_enabled.enabled) {
1212 /* OMPT state update */
1213 ti.state = prev_state;
1214 ti.wait_id = 0;
1216 /* OMPT event callback */
1217 if (ompt_enabled.ompt_callback_mutex_acquired) {
1218 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1219 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1222 #endif
1223 KMP_POP_PARTITIONED_TIMER();
1225 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1226 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1227 #endif // KMP_USE_DYNAMIC_LOCK
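/*
Illustrative sketch of how a named critical construct is bracketed by the
entry points in this file. The kmp_critical_name object is a per-name static
emitted by the compiler and starts zero-initialized (consistent with the
first-use initialization above); crit_name_lck, loc_example and
lowered_critical are hypothetical placeholders.
@code
static kmp_critical_name crit_name_lck; // one zero-initialized object per critical name

void lowered_critical(int *counter) {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc_example);
  __kmpc_critical(&loc_example, gtid, &crit_name_lck);
  ++*counter; // critical-section body
  __kmpc_end_critical(&loc_example, gtid, &crit_name_lck);
}
@endcode
*/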
1230 #if KMP_USE_DYNAMIC_LOCK
1232 // Converts the given hint to an internal lock implementation
1233 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1234 #if KMP_USE_TSX
1235 #define KMP_TSX_LOCK(seq) lockseq_##seq
1236 #else
1237 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1238 #endif
1240 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1241 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1242 #else
1243 #define KMP_CPUINFO_RTM 0
1244 #endif
1246 // Hints that do not require further logic
1247 if (hint & kmp_lock_hint_hle)
1248 return KMP_TSX_LOCK(hle);
1249 if (hint & kmp_lock_hint_rtm)
1250 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1251 if (hint & kmp_lock_hint_adaptive)
1252 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1254 // Rule out conflicting hints first by returning the default lock
1255 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1256 return __kmp_user_lock_seq;
1257 if ((hint & omp_lock_hint_speculative) &&
1258 (hint & omp_lock_hint_nonspeculative))
1259 return __kmp_user_lock_seq;
1261 // Do not even consider speculation when it appears to be contended
1262 if (hint & omp_lock_hint_contended)
1263 return lockseq_queuing;
1265 // Uncontended lock without speculation
1266 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1267 return lockseq_tas;
1269 // HLE lock for speculation
1270 if (hint & omp_lock_hint_speculative)
1271 return KMP_TSX_LOCK(hle);
1273 return __kmp_user_lock_seq;
1276 #if OMPT_SUPPORT && OMPT_OPTIONAL
1277 #if KMP_USE_DYNAMIC_LOCK
1278 static kmp_mutex_impl_t
1279 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1280 if (user_lock) {
1281 switch (KMP_EXTRACT_D_TAG(user_lock)) {
1282 case 0:
1283 break;
1284 #if KMP_USE_FUTEX
1285 case locktag_futex:
1286 return kmp_mutex_impl_queuing;
1287 #endif
1288 case locktag_tas:
1289 return kmp_mutex_impl_spin;
1290 #if KMP_USE_TSX
1291 case locktag_hle:
1292 return kmp_mutex_impl_speculative;
1293 #endif
1294 default:
1295 return kmp_mutex_impl_none;
1297 ilock = KMP_LOOKUP_I_LOCK(user_lock);
1299 KMP_ASSERT(ilock);
1300 switch (ilock->type) {
1301 #if KMP_USE_TSX
1302 case locktag_adaptive:
1303 case locktag_rtm:
1304 return kmp_mutex_impl_speculative;
1305 #endif
1306 case locktag_nested_tas:
1307 return kmp_mutex_impl_spin;
1308 #if KMP_USE_FUTEX
1309 case locktag_nested_futex:
1310 #endif
1311 case locktag_ticket:
1312 case locktag_queuing:
1313 case locktag_drdpa:
1314 case locktag_nested_ticket:
1315 case locktag_nested_queuing:
1316 case locktag_nested_drdpa:
1317 return kmp_mutex_impl_queuing;
1318 default:
1319 return kmp_mutex_impl_none;
1322 #else
1323 // For locks without dynamic binding
1324 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1325 switch (__kmp_user_lock_kind) {
1326 case lk_tas:
1327 return kmp_mutex_impl_spin;
1328 #if KMP_USE_FUTEX
1329 case lk_futex:
1330 #endif
1331 case lk_ticket:
1332 case lk_queuing:
1333 case lk_drdpa:
1334 return kmp_mutex_impl_queuing;
1335 #if KMP_USE_TSX
1336 case lk_hle:
1337 case lk_rtm:
1338 case lk_adaptive:
1339 return kmp_mutex_impl_speculative;
1340 #endif
1341 default:
1342 return kmp_mutex_impl_none;
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1349 @ingroup WORK_SHARING
1350 @param loc source location information.
1351 @param global_tid global thread number.
1352 @param crit identity of the critical section. This could be a pointer to a lock
1353 associated with the critical section, or some other suitably unique value.
1354 @param hint the lock hint.
1356 Enter code protected by a `critical` construct with a hint. The hint value is
1357 used to suggest a lock implementation. This function blocks until the executing
1358 thread can enter the critical section unless the hint suggests use of
1359 speculative execution and the hardware supports it.
1361 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1362 kmp_critical_name *crit, uint32_t hint) {
1363 KMP_COUNT_BLOCK(OMP_CRITICAL);
1364 kmp_user_lock_p lck;
1365 #if OMPT_SUPPORT && OMPT_OPTIONAL
1366 ompt_state_t prev_state = ompt_state_undefined;
1367 ompt_thread_info_t ti;
1368 // This is the case if called from __kmpc_critical:
1369 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1370 if (!codeptr)
1371 codeptr = OMPT_GET_RETURN_ADDRESS(0);
1372 #endif
1374 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1375 __kmp_assert_valid_gtid(global_tid);
1377 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1378 // Check if it is initialized.
1379 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1380 if (*lk == 0) {
1381 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1382 if (KMP_IS_D_LOCK(lckseq)) {
1383 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1384 KMP_GET_D_TAG(lckseq));
1385 } else {
1386 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1389 // Branch for accessing the actual lock object and set operation. This
1390 // branching is inevitable since this lock initialization does not follow the
1391 // normal dispatch path (lock table is not used).
1392 if (KMP_EXTRACT_D_TAG(lk) != 0) {
1393 lck = (kmp_user_lock_p)lk;
1394 if (__kmp_env_consistency_check) {
1395 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1396 __kmp_map_hint_to_lock(hint));
1398 #if USE_ITT_BUILD
1399 __kmp_itt_critical_acquiring(lck);
1400 #endif
1401 #if OMPT_SUPPORT && OMPT_OPTIONAL
1402 if (ompt_enabled.enabled) {
1403 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1404 /* OMPT state update */
1405 prev_state = ti.state;
1406 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1407 ti.state = ompt_state_wait_critical;
1409 /* OMPT event callback */
1410 if (ompt_enabled.ompt_callback_mutex_acquire) {
1411 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1412 ompt_mutex_critical, (unsigned int)hint,
1413 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1414 codeptr);
1417 #endif
1418 #if KMP_USE_INLINED_TAS
1419 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1420 KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1421 } else
1422 #elif KMP_USE_INLINED_FUTEX
1423 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1424 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1425 } else
1426 #endif
1428 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1430 } else {
1431 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1432 lck = ilk->lock;
1433 if (__kmp_env_consistency_check) {
1434 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1435 __kmp_map_hint_to_lock(hint));
1437 #if USE_ITT_BUILD
1438 __kmp_itt_critical_acquiring(lck);
1439 #endif
1440 #if OMPT_SUPPORT && OMPT_OPTIONAL
1441 if (ompt_enabled.enabled) {
1442 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1443 /* OMPT state update */
1444 prev_state = ti.state;
1445 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1446 ti.state = ompt_state_wait_critical;
1448 /* OMPT event callback */
1449 if (ompt_enabled.ompt_callback_mutex_acquire) {
1450 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1451 ompt_mutex_critical, (unsigned int)hint,
1452 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1453 codeptr);
1456 #endif
1457 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1459 KMP_POP_PARTITIONED_TIMER();
1461 #if USE_ITT_BUILD
1462 __kmp_itt_critical_acquired(lck);
1463 #endif /* USE_ITT_BUILD */
1464 #if OMPT_SUPPORT && OMPT_OPTIONAL
1465 if (ompt_enabled.enabled) {
1466 /* OMPT state update */
1467 ti.state = prev_state;
1468 ti.wait_id = 0;
1470 /* OMPT event callback */
1471 if (ompt_enabled.ompt_callback_mutex_acquired) {
1472 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1473 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1476 #endif
1478 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1479 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1480 } // __kmpc_critical_with_hint
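/*
Illustrative sketch: with a `hint` clause on a critical construct, the compiler
can call the _with_hint entry directly and pass one of the omp_lock_hint_*
values from omp.h, which __kmp_map_hint_to_lock() above maps to a lock
implementation (e.g. a contended hint selects a queuing lock). The names
crit_name_lck, loc_example and gtid are placeholders as before:
@code
  __kmpc_critical_with_hint(&loc_example, gtid, &crit_name_lck,
                            omp_lock_hint_contended); // prefer a queuing lock
  // ... critical-section body ...
  __kmpc_end_critical(&loc_example, gtid, &crit_name_lck);
@endcode
*/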
1482 #endif // KMP_USE_DYNAMIC_LOCK
1485 @ingroup WORK_SHARING
1486 @param loc source location information.
1487 @param global_tid global thread number.
1488 @param crit identity of the critical section. This could be a pointer to a lock
1489 associated with the critical section, or some other suitably unique value.
1491 Leave a critical section, releasing any lock that was held during its execution.
1493 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1494 kmp_critical_name *crit) {
1495 kmp_user_lock_p lck;
1497 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1499 #if KMP_USE_DYNAMIC_LOCK
1500 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1501 lck = (kmp_user_lock_p)crit;
1502 KMP_ASSERT(lck != NULL);
1503 if (__kmp_env_consistency_check) {
1504 __kmp_pop_sync(global_tid, ct_critical, loc);
1506 #if USE_ITT_BUILD
1507 __kmp_itt_critical_releasing(lck);
1508 #endif
1509 #if KMP_USE_INLINED_TAS
1510 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1511 KMP_RELEASE_TAS_LOCK(lck, global_tid);
1512 } else
1513 #elif KMP_USE_INLINED_FUTEX
1514 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1515 KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1516 } else
1517 #endif
1519 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1521 } else {
1522 kmp_indirect_lock_t *ilk =
1523 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1524 KMP_ASSERT(ilk != NULL);
1525 lck = ilk->lock;
1526 if (__kmp_env_consistency_check) {
1527 __kmp_pop_sync(global_tid, ct_critical, loc);
1529 #if USE_ITT_BUILD
1530 __kmp_itt_critical_releasing(lck);
1531 #endif
1532 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1535 #else // KMP_USE_DYNAMIC_LOCK
1537 if ((__kmp_user_lock_kind == lk_tas) &&
1538 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1539 lck = (kmp_user_lock_p)crit;
1541 #if KMP_USE_FUTEX
1542 else if ((__kmp_user_lock_kind == lk_futex) &&
1543 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1544 lck = (kmp_user_lock_p)crit;
1546 #endif
1547 else { // ticket, queuing or drdpa
1548 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1551 KMP_ASSERT(lck != NULL);
1553 if (__kmp_env_consistency_check)
1554 __kmp_pop_sync(global_tid, ct_critical, loc);
1556 #if USE_ITT_BUILD
1557 __kmp_itt_critical_releasing(lck);
1558 #endif /* USE_ITT_BUILD */
1559 // Value of 'crit' should be good for using as a critical_id of the critical
1560 // section directive.
1561 __kmp_release_user_lock_with_checks(lck, global_tid);
1563 #endif // KMP_USE_DYNAMIC_LOCK
1565 #if OMPT_SUPPORT && OMPT_OPTIONAL
1566 /* OMPT release event triggers after lock is released; place here to trigger
1567 * for all #if branches */
1568 OMPT_STORE_RETURN_ADDRESS(global_tid);
1569 if (ompt_enabled.ompt_callback_mutex_released) {
1570 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1571 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1572 OMPT_LOAD_RETURN_ADDRESS(0));
1574 #endif
1576 KMP_POP_PARTITIONED_TIMER();
1577 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1581 @ingroup SYNCHRONIZATION
1582 @param loc source location information
1583 @param global_tid thread id.
1584 @return one if the thread should execute the master block, zero otherwise
1586 Start execution of a combined barrier and master. The barrier is executed inside
1587 this function.
1589 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1590 int status;
1591 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1592 __kmp_assert_valid_gtid(global_tid);
1594 if (!TCR_4(__kmp_init_parallel))
1595 __kmp_parallel_initialize();
1597 __kmp_resume_if_soft_paused();
1599 if (__kmp_env_consistency_check)
1600 __kmp_check_barrier(global_tid, ct_barrier, loc);
1602 #if OMPT_SUPPORT
1603 ompt_frame_t *ompt_frame;
1604 if (ompt_enabled.enabled) {
1605 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1606 if (ompt_frame->enter_frame.ptr == NULL)
1607 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1608 OMPT_STORE_RETURN_ADDRESS(global_tid);
1610 #endif
1611 #if USE_ITT_NOTIFY
1612 __kmp_threads[global_tid]->th.th_ident = loc;
1613 #endif
1614 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1615 #if OMPT_SUPPORT && OMPT_OPTIONAL
1616 if (ompt_enabled.enabled) {
1617 ompt_frame->enter_frame = ompt_data_none;
1619 #endif
1621 return (status != 0) ? 0 : 1;
1625 @ingroup SYNCHRONIZATION
1626 @param loc source location information
1627 @param global_tid thread id.
1629 Complete the execution of a combined barrier and master. This function should
1630 only be called at the completion of the <tt>master</tt> code. Other threads will
1631 still be waiting at the barrier and this call releases them.
1633 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1634 KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1635 __kmp_assert_valid_gtid(global_tid);
1636 __kmp_end_split_barrier(bs_plain_barrier, global_tid);
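/*
Illustrative usage sketch for the combined barrier/master entry points above;
the guard-and-end shape follows from the documented return value, and
loc_example/gtid are placeholders:
@code
  if (__kmpc_barrier_master(&loc_example, gtid)) {
    // ... master-only code; the other threads wait at the barrier ...
    __kmpc_end_barrier_master(&loc_example, gtid); // releases the waiting threads
  }
@endcode
*/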
1640 @ingroup SYNCHRONIZATION
1641 @param loc source location information
1642 @param global_tid thread id.
1643 @return one if the thread should execute the master block, zero otherwise
1645 Start execution of a combined barrier and master(nowait) construct.
1646 The barrier is executed inside this function.
1647 There is no equivalent "end" function, since the master block is not followed by
a barrier here, so there is nothing for the executing thread to release.
1649 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1650 kmp_int32 ret;
1651 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1652 __kmp_assert_valid_gtid(global_tid);
1654 if (!TCR_4(__kmp_init_parallel))
1655 __kmp_parallel_initialize();
1657 __kmp_resume_if_soft_paused();
1659 if (__kmp_env_consistency_check) {
1660 if (loc == 0) {
1661 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1663 __kmp_check_barrier(global_tid, ct_barrier, loc);
1666 #if OMPT_SUPPORT
1667 ompt_frame_t *ompt_frame;
1668 if (ompt_enabled.enabled) {
1669 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1670 if (ompt_frame->enter_frame.ptr == NULL)
1671 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1672 OMPT_STORE_RETURN_ADDRESS(global_tid);
1674 #endif
1675 #if USE_ITT_NOTIFY
1676 __kmp_threads[global_tid]->th.th_ident = loc;
1677 #endif
1678 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1679 #if OMPT_SUPPORT && OMPT_OPTIONAL
1680 if (ompt_enabled.enabled) {
1681 ompt_frame->enter_frame = ompt_data_none;
1683 #endif
1685 ret = __kmpc_master(loc, global_tid);
1687 if (__kmp_env_consistency_check) {
1688 /* there's no __kmpc_end_master called; so the (stats) */
1689 /* actions of __kmpc_end_master are done here */
1690 if (ret) {
1691 /* only one thread should do the pop since only */
1692 /* one did the push (see __kmpc_master()) */
1693 __kmp_pop_sync(global_tid, ct_master, loc);
1697 return (ret);
1700 /* The BARRIER for a SINGLE process section is always explicit */
1702 @ingroup WORK_SHARING
1703 @param loc source location information
1704 @param global_tid global thread number
1705 @return One if this thread should execute the single construct, zero otherwise.
1707 Test whether to execute a <tt>single</tt> construct.
1708 There are no implicit barriers in the two "single" calls; rather, the compiler
1709 should introduce an explicit barrier if it is required.
1712 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1713 __kmp_assert_valid_gtid(global_tid);
1714 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1716 if (rc) {
1717 // We are going to execute the single statement, so we should count it.
1718 KMP_COUNT_BLOCK(OMP_SINGLE);
1719 KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1722 #if OMPT_SUPPORT && OMPT_OPTIONAL
1723 kmp_info_t *this_thr = __kmp_threads[global_tid];
1724 kmp_team_t *team = this_thr->th.th_team;
1725 int tid = __kmp_tid_from_gtid(global_tid);
1727 if (ompt_enabled.enabled) {
1728 if (rc) {
1729 if (ompt_enabled.ompt_callback_work) {
1730 ompt_callbacks.ompt_callback(ompt_callback_work)(
1731 ompt_work_single_executor, ompt_scope_begin,
1732 &(team->t.ompt_team_info.parallel_data),
1733 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1734 1, OMPT_GET_RETURN_ADDRESS(0));
1736 } else {
1737 if (ompt_enabled.ompt_callback_work) {
1738 ompt_callbacks.ompt_callback(ompt_callback_work)(
1739 ompt_work_single_other, ompt_scope_begin,
1740 &(team->t.ompt_team_info.parallel_data),
1741 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1742 1, OMPT_GET_RETURN_ADDRESS(0));
1743 ompt_callbacks.ompt_callback(ompt_callback_work)(
1744 ompt_work_single_other, ompt_scope_end,
1745 &(team->t.ompt_team_info.parallel_data),
1746 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1747 1, OMPT_GET_RETURN_ADDRESS(0));
1751 #endif
1753 return rc;
1757 @ingroup WORK_SHARING
1758 @param loc source location information
1759 @param global_tid global thread number
1761 Mark the end of a <tt>single</tt> construct. This function should
1762 only be called by the thread that executed the block of code protected
1763 by the <tt>single</tt> construct.
1765 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1766 __kmp_assert_valid_gtid(global_tid);
1767 __kmp_exit_single(global_tid);
1768 KMP_POP_PARTITIONED_TIMER();
1770 #if OMPT_SUPPORT && OMPT_OPTIONAL
1771 kmp_info_t *this_thr = __kmp_threads[global_tid];
1772 kmp_team_t *team = this_thr->th.th_team;
1773 int tid = __kmp_tid_from_gtid(global_tid);
1775 if (ompt_enabled.ompt_callback_work) {
1776 ompt_callbacks.ompt_callback(ompt_callback_work)(
1777 ompt_work_single_executor, ompt_scope_end,
1778 &(team->t.ompt_team_info.parallel_data),
1779 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1780 OMPT_GET_RETURN_ADDRESS(0));
1782 #endif
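/* Illustrative lowering sketch (assumed locals "loc" and "gtid"): since
   __kmpc_single/__kmpc_end_single carry no implicit barrier, a compiler that
   needs one (i.e. no "nowait" clause) is expected to emit it explicitly:

     if (__kmpc_single(&loc, gtid)) {
       // body of the single region
       __kmpc_end_single(&loc, gtid);
     }
     __kmpc_barrier(&loc, gtid); // omitted when "nowait" is present
*/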
1786 @ingroup WORK_SHARING
1787 @param loc Source location
1788 @param global_tid Global thread id
1790 Mark the end of a statically scheduled loop.
1792 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1793 KMP_POP_PARTITIONED_TIMER();
1794 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1796 #if OMPT_SUPPORT && OMPT_OPTIONAL
1797 if (ompt_enabled.ompt_callback_work) {
1798 ompt_work_t ompt_work_type = ompt_work_loop;
1799 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1800 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1801 // Determine workshare type
1802 if (loc != NULL) {
1803 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1804 ompt_work_type = ompt_work_loop;
1805 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1806 ompt_work_type = ompt_work_sections;
1807 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1808 ompt_work_type = ompt_work_distribute;
1809 } else {
1810 // use default set above.
1811 // a warning about this case is provided in __kmpc_for_static_init
1813 KMP_DEBUG_ASSERT(ompt_work_type);
1815 ompt_callbacks.ompt_callback(ompt_callback_work)(
1816 ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1817 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1819 #endif
1820 if (__kmp_env_consistency_check)
1821 __kmp_pop_workshare(global_tid, ct_pdo, loc);
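/* Illustrative sketch of how a statically scheduled loop reaches
   __kmpc_for_static_fini (assumed locals "loc", "gtid", "n", "body"; the
   actual code generator may differ). __kmpc_for_static_init_4 computes this
   thread's chunk bounds:

     kmp_int32 last = 0, lb = 0, ub = n - 1, st = 1;
     __kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lb, &ub,
                              &st, 1, 1);
     for (kmp_int32 i = lb; i <= ub; i += st)
       body(i);
     __kmpc_for_static_fini(&loc, gtid);
*/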
1824 // User routines which take C-style arguments (call by value), as opposed to
1825 // the Fortran equivalent routines
1827 void ompc_set_num_threads(int arg) {
1828 // !!!!! TODO: check the per-task binding
1829 __kmp_set_num_threads(arg, __kmp_entry_gtid());
1832 void ompc_set_dynamic(int flag) {
1833 kmp_info_t *thread;
1835 /* For the thread-private implementation of the internal controls */
1836 thread = __kmp_entry_thread();
1838 __kmp_save_internal_controls(thread);
1840 set__dynamic(thread, flag ? TRUE : FALSE);
1843 void ompc_set_nested(int flag) {
1844 kmp_info_t *thread;
1846 /* For the thread-private internal controls implementation */
1847 thread = __kmp_entry_thread();
1849 __kmp_save_internal_controls(thread);
1851 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1854 void ompc_set_max_active_levels(int max_active_levels) {
1855 /* TO DO */
1856 /* we want per-task implementation of this internal control */
1858 /* For the per-thread internal controls implementation */
1859 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1862 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1863 // !!!!! TODO: check the per-task binding
1864 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1867 int ompc_get_ancestor_thread_num(int level) {
1868 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1871 int ompc_get_team_size(int level) {
1872 return __kmp_get_team_size(__kmp_entry_gtid(), level);
1875 /* OpenMP 5.0 Affinity Format API */
1877 void ompc_set_affinity_format(char const *format) {
1878 if (!__kmp_init_serial) {
1879 __kmp_serial_initialize();
1881 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1882 format, KMP_STRLEN(format) + 1);
1885 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1886 size_t format_size;
1887 if (!__kmp_init_serial) {
1888 __kmp_serial_initialize();
1890 format_size = KMP_STRLEN(__kmp_affinity_format);
1891 if (buffer && size) {
1892 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1893 format_size + 1);
1895 return format_size;
1898 void ompc_display_affinity(char const *format) {
1899 int gtid;
1900 if (!TCR_4(__kmp_init_middle)) {
1901 __kmp_middle_initialize();
1903 gtid = __kmp_get_gtid();
1904 __kmp_aux_display_affinity(gtid, format);
1907 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1908 char const *format) {
1909 int gtid;
1910 size_t num_required;
1911 kmp_str_buf_t capture_buf;
1912 if (!TCR_4(__kmp_init_middle)) {
1913 __kmp_middle_initialize();
1915 gtid = __kmp_get_gtid();
1916 __kmp_str_buf_init(&capture_buf);
1917 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1918 if (buffer && buf_size) {
1919 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1920 capture_buf.used + 1);
1922 __kmp_str_buf_free(&capture_buf);
1923 return num_required;
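/* Illustrative sketch of the user-level OpenMP 5.0 API that forwards to these
   entry points (the format string is an arbitrary example; per the spec,
   %H expands to the host name and %n to the thread number):

     omp_set_affinity_format("host=%H tid=%n");
     char buf[80];
     size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
     // "needed" is the number of characters the full output requires,
     // even if the buffer was too small
*/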
1926 void kmpc_set_stacksize(int arg) {
1927 // __kmp_aux_set_stacksize initializes the library if needed
1928 __kmp_aux_set_stacksize(arg);
1931 void kmpc_set_stacksize_s(size_t arg) {
1932 // __kmp_aux_set_stacksize initializes the library if needed
1933 __kmp_aux_set_stacksize(arg);
1936 void kmpc_set_blocktime(int arg) {
1937 int gtid, tid;
1938 kmp_info_t *thread;
1940 gtid = __kmp_entry_gtid();
1941 tid = __kmp_tid_from_gtid(gtid);
1942 thread = __kmp_thread_from_gtid(gtid);
1944 __kmp_aux_set_blocktime(arg, thread, tid);
1947 void kmpc_set_library(int arg) {
1948 // __kmp_user_set_library initializes the library if needed
1949 __kmp_user_set_library((enum library_type)arg);
1952 void kmpc_set_defaults(char const *str) {
1953 // __kmp_aux_set_defaults initializes the library if needed
1954 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1957 void kmpc_set_disp_num_buffers(int arg) {
1958 // ignore after initialization because some teams have already
1959 // allocated dispatch buffers
1960 if (__kmp_init_serial == 0 && arg > 0)
1961 __kmp_dispatch_num_buffers = arg;
1964 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1965 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1966 return -1;
1967 #else
1968 if (!TCR_4(__kmp_init_middle)) {
1969 __kmp_middle_initialize();
1971 return __kmp_aux_set_affinity_mask_proc(proc, mask);
1972 #endif
1975 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1976 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1977 return -1;
1978 #else
1979 if (!TCR_4(__kmp_init_middle)) {
1980 __kmp_middle_initialize();
1982 return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1983 #endif
1986 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1987 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1988 return -1;
1989 #else
1990 if (!TCR_4(__kmp_init_middle)) {
1991 __kmp_middle_initialize();
1993 return __kmp_aux_get_affinity_mask_proc(proc, mask);
1994 #endif
1997 /* -------------------------------------------------------------------------- */
1999 @ingroup THREADPRIVATE
2000 @param loc source location information
2001 @param gtid global thread number
2002 @param cpy_size size of the cpy_data buffer
2003 @param cpy_data pointer to data to be copied
2004 @param cpy_func helper function to call for copying data
2005 @param didit flag variable: 1=single thread; 0=not single thread
2007 __kmpc_copyprivate implements the interface for the private data broadcast
2008 needed for the copyprivate clause associated with a single region in an
2009 OpenMP<sup>*</sup> program (both C and Fortran).
2010 All threads participating in the parallel region call this routine.
2011 One of the threads (called the single thread) should have the <tt>didit</tt>
2012 variable set to 1 and all other threads should have that variable set to 0.
2013 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2015 The OpenMP specification forbids the use of nowait on the single region when a
2016 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2017 barrier internally to avoid race conditions, so the code generation for the
2018 single region should avoid generating a barrier after the call to @ref
2019 __kmpc_copyprivate.
2021 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2022 The <tt>loc</tt> parameter is a pointer to source location information.
2024 Internal implementation: the single thread first copies its descriptor
2025 address (cpy_data) to a team-private location; after a barrier, each of the
2026 other threads calls the function pointed to by cpy_func, with its own cpy_data
2027 as the destination and the single thread's cpy_data as the source.
2029 The cpy_func routine used for the copy and the contents of the data area defined
2030 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2031 to be done. For instance, the cpy_data buffer can hold the actual data to be
2032 copied or it may hold a list of pointers to the data. The cpy_func routine must
2033 interpret the cpy_data buffer appropriately.
2035 The interface to cpy_func is as follows:
2036 @code
2037 void cpy_func( void *destination, void *source )
2038 @endcode
2039 where void *destination is the cpy_data pointer for the thread being copied to
2040 and void *source is the cpy_data pointer for the thread being copied from.
2042 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2043 void *cpy_data, void (*cpy_func)(void *, void *),
2044 kmp_int32 didit) {
2045 void **data_ptr;
2046 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2047 __kmp_assert_valid_gtid(gtid);
2049 KMP_MB();
2051 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2053 if (__kmp_env_consistency_check) {
2054 if (loc == 0) {
2055 KMP_WARNING(ConstructIdentInvalid);
2059 // ToDo: Optimize the following two barriers into some kind of split barrier
2061 if (didit)
2062 *data_ptr = cpy_data;
2064 #if OMPT_SUPPORT
2065 ompt_frame_t *ompt_frame;
2066 if (ompt_enabled.enabled) {
2067 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2068 if (ompt_frame->enter_frame.ptr == NULL)
2069 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2070 OMPT_STORE_RETURN_ADDRESS(gtid);
2072 #endif
2073 /* This barrier is not a barrier region boundary */
2074 #if USE_ITT_NOTIFY
2075 __kmp_threads[gtid]->th.th_ident = loc;
2076 #endif
2077 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2079 if (!didit)
2080 (*cpy_func)(cpy_data, *data_ptr);
2082 // Consider next barrier a user-visible barrier for barrier region boundaries
2083 // Nesting checks are already handled by the single construct checks
2085 #if OMPT_SUPPORT
2086 if (ompt_enabled.enabled) {
2087 OMPT_STORE_RETURN_ADDRESS(gtid);
2089 #endif
2090 #if USE_ITT_NOTIFY
2091 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2092 // tasks can overwrite the location)
2093 #endif
2094 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2095 #if OMPT_SUPPORT && OMPT_OPTIONAL
2096 if (ompt_enabled.enabled) {
2097 ompt_frame->enter_frame = ompt_data_none;
2099 #endif
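/* Illustrative lowering sketch for "#pragma omp single copyprivate(x)"
   (assumed names "cpy_t", "copy_fn", "compute", "loc", "gtid"; the real code
   generator may package the data differently). The helper copies through a
   pointer list, so it works for any copyprivate item:

     typedef struct { int *px; } cpy_t;
     static void copy_fn(void *dst, void *src) {
       *((cpy_t *)dst)->px = *((cpy_t *)src)->px;
     }

     // inside the outlined parallel region, with private int x:
     cpy_t cpy = { &x };
     kmp_int32 didit = 0;
     if (__kmpc_single(&loc, gtid)) {
       x = compute();                // executed by the single thread only
       didit = 1;
       __kmpc_end_single(&loc, gtid);
     }
     __kmpc_copyprivate(&loc, gtid, sizeof(cpy), &cpy, copy_fn, didit);
     // no trailing barrier is needed; __kmpc_copyprivate synchronizes
*/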
2102 /* -------------------------------------------------------------------------- */
2104 #define INIT_LOCK __kmp_init_user_lock_with_checks
2105 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2106 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2107 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2108 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2109 #define ACQUIRE_NESTED_LOCK_TIMED \
2110 __kmp_acquire_nested_user_lock_with_checks_timed
2111 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2112 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2113 #define TEST_LOCK __kmp_test_user_lock_with_checks
2114 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2115 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2116 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2118 // TODO: Make check abort messages use location info & pass it into
2119 // with_checks routines
2121 #if KMP_USE_DYNAMIC_LOCK
2123 // internal lock initializer
2124 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2125 kmp_dyna_lockseq_t seq) {
2126 if (KMP_IS_D_LOCK(seq)) {
2127 KMP_INIT_D_LOCK(lock, seq);
2128 #if USE_ITT_BUILD
2129 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2130 #endif
2131 } else {
2132 KMP_INIT_I_LOCK(lock, seq);
2133 #if USE_ITT_BUILD
2134 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2135 __kmp_itt_lock_creating(ilk->lock, loc);
2136 #endif
2140 // internal nest lock initializer
2141 static __forceinline void
2142 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2143 kmp_dyna_lockseq_t seq) {
2144 #if KMP_USE_TSX
2145 // Don't have nested lock implementation for speculative locks
2146 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2147 seq = __kmp_user_lock_seq;
2148 #endif
2149 switch (seq) {
2150 case lockseq_tas:
2151 seq = lockseq_nested_tas;
2152 break;
2153 #if KMP_USE_FUTEX
2154 case lockseq_futex:
2155 seq = lockseq_nested_futex;
2156 break;
2157 #endif
2158 case lockseq_ticket:
2159 seq = lockseq_nested_ticket;
2160 break;
2161 case lockseq_queuing:
2162 seq = lockseq_nested_queuing;
2163 break;
2164 case lockseq_drdpa:
2165 seq = lockseq_nested_drdpa;
2166 break;
2167 default:
2168 seq = lockseq_nested_queuing;
2170 KMP_INIT_I_LOCK(lock, seq);
2171 #if USE_ITT_BUILD
2172 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2173 __kmp_itt_lock_creating(ilk->lock, loc);
2174 #endif
2177 /* initialize the lock with a hint */
2178 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2179 uintptr_t hint) {
2180 KMP_DEBUG_ASSERT(__kmp_init_serial);
2181 if (__kmp_env_consistency_check && user_lock == NULL) {
2182 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2185 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2187 #if OMPT_SUPPORT && OMPT_OPTIONAL
2188 // This is the case, if called from omp_init_lock_with_hint:
2189 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2190 if (!codeptr)
2191 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2192 if (ompt_enabled.ompt_callback_lock_init) {
2193 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2194 ompt_mutex_lock, (omp_lock_hint_t)hint,
2195 __ompt_get_mutex_impl_type(user_lock),
2196 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2198 #endif
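/* Illustrative user-level call that reaches this entry point (standard
   OpenMP 4.5+ API; the hint is only a suggestion to the runtime):

     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_lock_hint_contended);
*/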
2201 /* initialize the lock with a hint */
2202 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2203 void **user_lock, uintptr_t hint) {
2204 KMP_DEBUG_ASSERT(__kmp_init_serial);
2205 if (__kmp_env_consistency_check && user_lock == NULL) {
2206 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2209 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2211 #if OMPT_SUPPORT && OMPT_OPTIONAL
2212 // This is the case, if called from omp_init_lock_with_hint:
2213 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2214 if (!codeptr)
2215 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2216 if (ompt_enabled.ompt_callback_lock_init) {
2217 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2218 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2219 __ompt_get_mutex_impl_type(user_lock),
2220 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2222 #endif
2225 #endif // KMP_USE_DYNAMIC_LOCK
2227 /* initialize the lock */
2228 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2229 #if KMP_USE_DYNAMIC_LOCK
2231 KMP_DEBUG_ASSERT(__kmp_init_serial);
2232 if (__kmp_env_consistency_check && user_lock == NULL) {
2233 KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2235 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2237 #if OMPT_SUPPORT && OMPT_OPTIONAL
2238 // This is the case, if called from omp_init_lock_with_hint:
2239 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2240 if (!codeptr)
2241 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2242 if (ompt_enabled.ompt_callback_lock_init) {
2243 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2244 ompt_mutex_lock, omp_lock_hint_none,
2245 __ompt_get_mutex_impl_type(user_lock),
2246 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2248 #endif
2250 #else // KMP_USE_DYNAMIC_LOCK
2252 static char const *const func = "omp_init_lock";
2253 kmp_user_lock_p lck;
2254 KMP_DEBUG_ASSERT(__kmp_init_serial);
2256 if (__kmp_env_consistency_check) {
2257 if (user_lock == NULL) {
2258 KMP_FATAL(LockIsUninitialized, func);
2262 KMP_CHECK_USER_LOCK_INIT();
2264 if ((__kmp_user_lock_kind == lk_tas) &&
2265 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2266 lck = (kmp_user_lock_p)user_lock;
2268 #if KMP_USE_FUTEX
2269 else if ((__kmp_user_lock_kind == lk_futex) &&
2270 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2271 lck = (kmp_user_lock_p)user_lock;
2273 #endif
2274 else {
2275 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2277 INIT_LOCK(lck);
2278 __kmp_set_user_lock_location(lck, loc);
2280 #if OMPT_SUPPORT && OMPT_OPTIONAL
2281 // This is the case, if called from omp_init_lock_with_hint:
2282 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2283 if (!codeptr)
2284 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2285 if (ompt_enabled.ompt_callback_lock_init) {
2286 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2287 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2288 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2290 #endif
2292 #if USE_ITT_BUILD
2293 __kmp_itt_lock_creating(lck);
2294 #endif /* USE_ITT_BUILD */
2296 #endif // KMP_USE_DYNAMIC_LOCK
2297 } // __kmpc_init_lock
2299 /* initialize the lock */
2300 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2301 #if KMP_USE_DYNAMIC_LOCK
2303 KMP_DEBUG_ASSERT(__kmp_init_serial);
2304 if (__kmp_env_consistency_check && user_lock == NULL) {
2305 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2307 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310 // This is the case, if called from omp_init_lock_with_hint:
2311 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2312 if (!codeptr)
2313 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2314 if (ompt_enabled.ompt_callback_lock_init) {
2315 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2316 ompt_mutex_nest_lock, omp_lock_hint_none,
2317 __ompt_get_mutex_impl_type(user_lock),
2318 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2320 #endif
2322 #else // KMP_USE_DYNAMIC_LOCK
2324 static char const *const func = "omp_init_nest_lock";
2325 kmp_user_lock_p lck;
2326 KMP_DEBUG_ASSERT(__kmp_init_serial);
2328 if (__kmp_env_consistency_check) {
2329 if (user_lock == NULL) {
2330 KMP_FATAL(LockIsUninitialized, func);
2334 KMP_CHECK_USER_LOCK_INIT();
2336 if ((__kmp_user_lock_kind == lk_tas) &&
2337 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2338 OMP_NEST_LOCK_T_SIZE)) {
2339 lck = (kmp_user_lock_p)user_lock;
2341 #if KMP_USE_FUTEX
2342 else if ((__kmp_user_lock_kind == lk_futex) &&
2343 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2344 OMP_NEST_LOCK_T_SIZE)) {
2345 lck = (kmp_user_lock_p)user_lock;
2347 #endif
2348 else {
2349 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2352 INIT_NESTED_LOCK(lck);
2353 __kmp_set_user_lock_location(lck, loc);
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356 // This is the case, if called from omp_init_lock_with_hint:
2357 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2358 if (!codeptr)
2359 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2360 if (ompt_enabled.ompt_callback_lock_init) {
2361 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2362 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2363 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2365 #endif
2367 #if USE_ITT_BUILD
2368 __kmp_itt_lock_creating(lck);
2369 #endif /* USE_ITT_BUILD */
2371 #endif // KMP_USE_DYNAMIC_LOCK
2372 } // __kmpc_init_nest_lock
2374 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2375 #if KMP_USE_DYNAMIC_LOCK
2377 #if USE_ITT_BUILD
2378 kmp_user_lock_p lck;
2379 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2380 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2381 } else {
2382 lck = (kmp_user_lock_p)user_lock;
2384 __kmp_itt_lock_destroyed(lck);
2385 #endif
2386 #if OMPT_SUPPORT && OMPT_OPTIONAL
2387 // This is the case, if called from omp_init_lock_with_hint:
2388 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2389 if (!codeptr)
2390 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2391 if (ompt_enabled.ompt_callback_lock_destroy) {
2392 kmp_user_lock_p lck;
2393 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2394 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2395 } else {
2396 lck = (kmp_user_lock_p)user_lock;
2398 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2399 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2401 #endif
2402 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2403 #else
2404 kmp_user_lock_p lck;
2406 if ((__kmp_user_lock_kind == lk_tas) &&
2407 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2408 lck = (kmp_user_lock_p)user_lock;
2410 #if KMP_USE_FUTEX
2411 else if ((__kmp_user_lock_kind == lk_futex) &&
2412 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2413 lck = (kmp_user_lock_p)user_lock;
2415 #endif
2416 else {
2417 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2420 #if OMPT_SUPPORT && OMPT_OPTIONAL
2421 // This is the case, if called from omp_init_lock_with_hint:
2422 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2423 if (!codeptr)
2424 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2425 if (ompt_enabled.ompt_callback_lock_destroy) {
2426 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2427 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2429 #endif
2431 #if USE_ITT_BUILD
2432 __kmp_itt_lock_destroyed(lck);
2433 #endif /* USE_ITT_BUILD */
2434 DESTROY_LOCK(lck);
2436 if ((__kmp_user_lock_kind == lk_tas) &&
2437 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2440 #if KMP_USE_FUTEX
2441 else if ((__kmp_user_lock_kind == lk_futex) &&
2442 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2445 #endif
2446 else {
2447 __kmp_user_lock_free(user_lock, gtid, lck);
2449 #endif // KMP_USE_DYNAMIC_LOCK
2450 } // __kmpc_destroy_lock
2452 /* destroy the lock */
2453 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2454 #if KMP_USE_DYNAMIC_LOCK
2456 #if USE_ITT_BUILD
2457 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2458 __kmp_itt_lock_destroyed(ilk->lock);
2459 #endif
2460 #if OMPT_SUPPORT && OMPT_OPTIONAL
2461 // This is the case, if called from omp_init_lock_with_hint:
2462 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2463 if (!codeptr)
2464 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2465 if (ompt_enabled.ompt_callback_lock_destroy) {
2466 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2467 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2469 #endif
2470 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2472 #else // KMP_USE_DYNAMIC_LOCK
2474 kmp_user_lock_p lck;
2476 if ((__kmp_user_lock_kind == lk_tas) &&
2477 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2478 OMP_NEST_LOCK_T_SIZE)) {
2479 lck = (kmp_user_lock_p)user_lock;
2481 #if KMP_USE_FUTEX
2482 else if ((__kmp_user_lock_kind == lk_futex) &&
2483 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2484 OMP_NEST_LOCK_T_SIZE)) {
2485 lck = (kmp_user_lock_p)user_lock;
2487 #endif
2488 else {
2489 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2492 #if OMPT_SUPPORT && OMPT_OPTIONAL
2493 // This is the case, if called from omp_init_lock_with_hint:
2494 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2495 if (!codeptr)
2496 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2497 if (ompt_enabled.ompt_callback_lock_destroy) {
2498 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2499 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2501 #endif
2503 #if USE_ITT_BUILD
2504 __kmp_itt_lock_destroyed(lck);
2505 #endif /* USE_ITT_BUILD */
2507 DESTROY_NESTED_LOCK(lck);
2509 if ((__kmp_user_lock_kind == lk_tas) &&
2510 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2511 OMP_NEST_LOCK_T_SIZE)) {
2514 #if KMP_USE_FUTEX
2515 else if ((__kmp_user_lock_kind == lk_futex) &&
2516 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2517 OMP_NEST_LOCK_T_SIZE)) {
2520 #endif
2521 else {
2522 __kmp_user_lock_free(user_lock, gtid, lck);
2524 #endif // KMP_USE_DYNAMIC_LOCK
2525 } // __kmpc_destroy_nest_lock
2527 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2528 KMP_COUNT_BLOCK(OMP_set_lock);
2529 #if KMP_USE_DYNAMIC_LOCK
2530 int tag = KMP_EXTRACT_D_TAG(user_lock);
2531 #if USE_ITT_BUILD
2532 __kmp_itt_lock_acquiring(
2533 (kmp_user_lock_p)
2534 user_lock); // itt function will get to the right lock object.
2535 #endif
2536 #if OMPT_SUPPORT && OMPT_OPTIONAL
2537 // This is the case, if called from omp_init_lock_with_hint:
2538 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2539 if (!codeptr)
2540 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2541 if (ompt_enabled.ompt_callback_mutex_acquire) {
2542 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2543 ompt_mutex_lock, omp_lock_hint_none,
2544 __ompt_get_mutex_impl_type(user_lock),
2545 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2547 #endif
2548 #if KMP_USE_INLINED_TAS
2549 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2550 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2551 } else
2552 #elif KMP_USE_INLINED_FUTEX
2553 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2554 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2555 } else
2556 #endif
2558 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2560 #if USE_ITT_BUILD
2561 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2562 #endif
2563 #if OMPT_SUPPORT && OMPT_OPTIONAL
2564 if (ompt_enabled.ompt_callback_mutex_acquired) {
2565 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2566 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2568 #endif
2570 #else // KMP_USE_DYNAMIC_LOCK
2572 kmp_user_lock_p lck;
2574 if ((__kmp_user_lock_kind == lk_tas) &&
2575 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2576 lck = (kmp_user_lock_p)user_lock;
2578 #if KMP_USE_FUTEX
2579 else if ((__kmp_user_lock_kind == lk_futex) &&
2580 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2581 lck = (kmp_user_lock_p)user_lock;
2583 #endif
2584 else {
2585 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2588 #if USE_ITT_BUILD
2589 __kmp_itt_lock_acquiring(lck);
2590 #endif /* USE_ITT_BUILD */
2591 #if OMPT_SUPPORT && OMPT_OPTIONAL
2592 // This is the case, if called from omp_init_lock_with_hint:
2593 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2594 if (!codeptr)
2595 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2596 if (ompt_enabled.ompt_callback_mutex_acquire) {
2597 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2598 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2599 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2601 #endif
2603 ACQUIRE_LOCK(lck, gtid);
2605 #if USE_ITT_BUILD
2606 __kmp_itt_lock_acquired(lck);
2607 #endif /* USE_ITT_BUILD */
2609 #if OMPT_SUPPORT && OMPT_OPTIONAL
2610 if (ompt_enabled.ompt_callback_mutex_acquired) {
2611 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2612 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2614 #endif
2616 #endif // KMP_USE_DYNAMIC_LOCK
2619 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2620 #if KMP_USE_DYNAMIC_LOCK
2622 #if USE_ITT_BUILD
2623 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2624 #endif
2625 #if OMPT_SUPPORT && OMPT_OPTIONAL
2626 // This is the case, if called from omp_init_lock_with_hint:
2627 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2628 if (!codeptr)
2629 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2630 if (ompt_enabled.enabled) {
2631 if (ompt_enabled.ompt_callback_mutex_acquire) {
2632 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2633 ompt_mutex_nest_lock, omp_lock_hint_none,
2634 __ompt_get_mutex_impl_type(user_lock),
2635 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2638 #endif
2639 int acquire_status =
2640 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2641 (void) acquire_status;
2642 #if USE_ITT_BUILD
2643 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2644 #endif
2646 #if OMPT_SUPPORT && OMPT_OPTIONAL
2647 if (ompt_enabled.enabled) {
2648 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2649 if (ompt_enabled.ompt_callback_mutex_acquired) {
2650 // lock_first
2651 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2652 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2653 codeptr);
2655 } else {
2656 if (ompt_enabled.ompt_callback_nest_lock) {
2657 // lock_next
2658 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2659 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2663 #endif
2665 #else // KMP_USE_DYNAMIC_LOCK
2666 int acquire_status;
2667 kmp_user_lock_p lck;
2669 if ((__kmp_user_lock_kind == lk_tas) &&
2670 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2671 OMP_NEST_LOCK_T_SIZE)) {
2672 lck = (kmp_user_lock_p)user_lock;
2674 #if KMP_USE_FUTEX
2675 else if ((__kmp_user_lock_kind == lk_futex) &&
2676 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2677 OMP_NEST_LOCK_T_SIZE)) {
2678 lck = (kmp_user_lock_p)user_lock;
2680 #endif
2681 else {
2682 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2685 #if USE_ITT_BUILD
2686 __kmp_itt_lock_acquiring(lck);
2687 #endif /* USE_ITT_BUILD */
2688 #if OMPT_SUPPORT && OMPT_OPTIONAL
2689 // This is the case, if called from omp_init_lock_with_hint:
2690 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2691 if (!codeptr)
2692 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2693 if (ompt_enabled.enabled) {
2694 if (ompt_enabled.ompt_callback_mutex_acquire) {
2695 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2696 ompt_mutex_nest_lock, omp_lock_hint_none,
2697 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2698 codeptr);
2701 #endif
2703 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2705 #if USE_ITT_BUILD
2706 __kmp_itt_lock_acquired(lck);
2707 #endif /* USE_ITT_BUILD */
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710 if (ompt_enabled.enabled) {
2711 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2712 if (ompt_enabled.ompt_callback_mutex_acquired) {
2713 // lock_first
2714 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2715 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2717 } else {
2718 if (ompt_enabled.ompt_callback_nest_lock) {
2719 // lock_next
2720 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2721 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2725 #endif
2727 #endif // KMP_USE_DYNAMIC_LOCK
2730 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2731 #if KMP_USE_DYNAMIC_LOCK
2733 int tag = KMP_EXTRACT_D_TAG(user_lock);
2734 #if USE_ITT_BUILD
2735 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2736 #endif
2737 #if KMP_USE_INLINED_TAS
2738 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2739 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2740 } else
2741 #elif KMP_USE_INLINED_FUTEX
2742 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2743 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2744 } else
2745 #endif
2747 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2750 #if OMPT_SUPPORT && OMPT_OPTIONAL
2751 // This is the case, if called from omp_init_lock_with_hint:
2752 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2753 if (!codeptr)
2754 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2755 if (ompt_enabled.ompt_callback_mutex_released) {
2756 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2757 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2759 #endif
2761 #else // KMP_USE_DYNAMIC_LOCK
2763 kmp_user_lock_p lck;
2765 /* Can't use serial interval since not block structured */
2766 /* release the lock */
2768 if ((__kmp_user_lock_kind == lk_tas) &&
2769 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2770 #if KMP_OS_LINUX && \
2771 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2772 // "fast" path implemented to fix customer performance issue
2773 #if USE_ITT_BUILD
2774 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2775 #endif /* USE_ITT_BUILD */
2776 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2777 KMP_MB();
2779 #if OMPT_SUPPORT && OMPT_OPTIONAL
2780 // This is the case, if called from omp_init_lock_with_hint:
2781 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2782 if (!codeptr)
2783 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2784 if (ompt_enabled.ompt_callback_mutex_released) {
2785 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2786 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2788 #endif
2790 return;
2791 #else
2792 lck = (kmp_user_lock_p)user_lock;
2793 #endif
2795 #if KMP_USE_FUTEX
2796 else if ((__kmp_user_lock_kind == lk_futex) &&
2797 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2798 lck = (kmp_user_lock_p)user_lock;
2800 #endif
2801 else {
2802 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2805 #if USE_ITT_BUILD
2806 __kmp_itt_lock_releasing(lck);
2807 #endif /* USE_ITT_BUILD */
2809 RELEASE_LOCK(lck, gtid);
2811 #if OMPT_SUPPORT && OMPT_OPTIONAL
2812 // This is the case, if called from omp_init_lock_with_hint:
2813 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2814 if (!codeptr)
2815 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2816 if (ompt_enabled.ompt_callback_mutex_released) {
2817 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2818 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2820 #endif
2822 #endif // KMP_USE_DYNAMIC_LOCK
2825 /* release the lock */
2826 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2827 #if KMP_USE_DYNAMIC_LOCK
2829 #if USE_ITT_BUILD
2830 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2831 #endif
2832 int release_status =
2833 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2834 (void) release_status;
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837 // This is the case, if called from omp_init_lock_with_hint:
2838 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2839 if (!codeptr)
2840 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2841 if (ompt_enabled.enabled) {
2842 if (release_status == KMP_LOCK_RELEASED) {
2843 if (ompt_enabled.ompt_callback_mutex_released) {
2844 // release_lock_last
2845 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2846 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2847 codeptr);
2849 } else if (ompt_enabled.ompt_callback_nest_lock) {
2850 // release_lock_prev
2851 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2852 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2855 #endif
2857 #else // KMP_USE_DYNAMIC_LOCK
2859 kmp_user_lock_p lck;
2861 /* Can't use serial interval since not block structured */
2863 if ((__kmp_user_lock_kind == lk_tas) &&
2864 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2865 OMP_NEST_LOCK_T_SIZE)) {
2866 #if KMP_OS_LINUX && \
2867 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2868 // "fast" path implemented to fix customer performance issue
2869 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2870 #if USE_ITT_BUILD
2871 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2872 #endif /* USE_ITT_BUILD */
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875 int release_status = KMP_LOCK_STILL_HELD;
2876 #endif
2878 if (--(tl->lk.depth_locked) == 0) {
2879 TCW_4(tl->lk.poll, 0);
2880 #if OMPT_SUPPORT && OMPT_OPTIONAL
2881 release_status = KMP_LOCK_RELEASED;
2882 #endif
2884 KMP_MB();
2886 #if OMPT_SUPPORT && OMPT_OPTIONAL
2887 // This is the case, if called from omp_init_lock_with_hint:
2888 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2889 if (!codeptr)
2890 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2891 if (ompt_enabled.enabled) {
2892 if (release_status == KMP_LOCK_RELEASED) {
2893 if (ompt_enabled.ompt_callback_mutex_released) {
2894 // release_lock_last
2895 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2896 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2898 } else if (ompt_enabled.ompt_callback_nest_lock) {
2899 // release_lock_previous
2900 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2901 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2904 #endif
2906 return;
2907 #else
2908 lck = (kmp_user_lock_p)user_lock;
2909 #endif
2911 #if KMP_USE_FUTEX
2912 else if ((__kmp_user_lock_kind == lk_futex) &&
2913 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2914 OMP_NEST_LOCK_T_SIZE)) {
2915 lck = (kmp_user_lock_p)user_lock;
2917 #endif
2918 else {
2919 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2922 #if USE_ITT_BUILD
2923 __kmp_itt_lock_releasing(lck);
2924 #endif /* USE_ITT_BUILD */
2926 int release_status;
2927 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2928 #if OMPT_SUPPORT && OMPT_OPTIONAL
2929 // This is the case, if called from omp_init_lock_with_hint:
2930 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2931 if (!codeptr)
2932 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2933 if (ompt_enabled.enabled) {
2934 if (release_status == KMP_LOCK_RELEASED) {
2935 if (ompt_enabled.ompt_callback_mutex_released) {
2936 // release_lock_last
2937 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2938 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2940 } else if (ompt_enabled.ompt_callback_nest_lock) {
2941 // release_lock_previous
2942 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2943 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2946 #endif
2948 #endif // KMP_USE_DYNAMIC_LOCK
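/* Illustrative user-level nested-lock sequence (all standard OpenMP API);
   each omp_set_nest_lock by the owning thread must be matched by one
   omp_unset_nest_lock before the lock is actually released:

     omp_nest_lock_t l;
     omp_init_nest_lock(&l);
     omp_set_nest_lock(&l);
     omp_set_nest_lock(&l);     // nested re-acquisition by the owner
     omp_unset_nest_lock(&l);   // still held
     omp_unset_nest_lock(&l);   // released
     omp_destroy_nest_lock(&l);
*/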
2951 /* try to acquire the lock */
2952 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2953 KMP_COUNT_BLOCK(OMP_test_lock);
2955 #if KMP_USE_DYNAMIC_LOCK
2956 int rc;
2957 int tag = KMP_EXTRACT_D_TAG(user_lock);
2958 #if USE_ITT_BUILD
2959 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2960 #endif
2961 #if OMPT_SUPPORT && OMPT_OPTIONAL
2962 // This is the case, if called from omp_init_lock_with_hint:
2963 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2964 if (!codeptr)
2965 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2966 if (ompt_enabled.ompt_callback_mutex_acquire) {
2967 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2968 ompt_mutex_lock, omp_lock_hint_none,
2969 __ompt_get_mutex_impl_type(user_lock),
2970 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2972 #endif
2973 #if KMP_USE_INLINED_TAS
2974 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2975 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2976 } else
2977 #elif KMP_USE_INLINED_FUTEX
2978 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2979 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2980 } else
2981 #endif
2983 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2985 if (rc) {
2986 #if USE_ITT_BUILD
2987 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2988 #endif
2989 #if OMPT_SUPPORT && OMPT_OPTIONAL
2990 if (ompt_enabled.ompt_callback_mutex_acquired) {
2991 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2992 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2994 #endif
2995 return FTN_TRUE;
2996 } else {
2997 #if USE_ITT_BUILD
2998 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2999 #endif
3000 return FTN_FALSE;
3003 #else // KMP_USE_DYNAMIC_LOCK
3005 kmp_user_lock_p lck;
3006 int rc;
3008 if ((__kmp_user_lock_kind == lk_tas) &&
3009 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3010 lck = (kmp_user_lock_p)user_lock;
3012 #if KMP_USE_FUTEX
3013 else if ((__kmp_user_lock_kind == lk_futex) &&
3014 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3015 lck = (kmp_user_lock_p)user_lock;
3017 #endif
3018 else {
3019 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3022 #if USE_ITT_BUILD
3023 __kmp_itt_lock_acquiring(lck);
3024 #endif /* USE_ITT_BUILD */
3025 #if OMPT_SUPPORT && OMPT_OPTIONAL
3026 // This is the case, if called from omp_init_lock_with_hint:
3027 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3028 if (!codeptr)
3029 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3030 if (ompt_enabled.ompt_callback_mutex_acquire) {
3031 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3032 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3033 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3035 #endif
3037 rc = TEST_LOCK(lck, gtid);
3038 #if USE_ITT_BUILD
3039 if (rc) {
3040 __kmp_itt_lock_acquired(lck);
3041 } else {
3042 __kmp_itt_lock_cancelled(lck);
3044 #endif /* USE_ITT_BUILD */
3045 #if OMPT_SUPPORT && OMPT_OPTIONAL
3046 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3047 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3048 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3050 #endif
3052 return (rc ? FTN_TRUE : FTN_FALSE);
3054 /* Can't use serial interval since not block structured */
3056 #endif // KMP_USE_DYNAMIC_LOCK
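/* Illustrative user-level sequence that reaches __kmpc_test_lock through
   omp_test_lock (standard OpenMP API):

     omp_lock_t l;
     omp_init_lock(&l);
     if (omp_test_lock(&l)) {   // non-blocking attempt
       // ... protected work ...
       omp_unset_lock(&l);
     }
     omp_destroy_lock(&l);
*/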
3059 /* try to acquire the lock */
3060 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3061 #if KMP_USE_DYNAMIC_LOCK
3062 int rc;
3063 #if USE_ITT_BUILD
3064 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3065 #endif
3066 #if OMPT_SUPPORT && OMPT_OPTIONAL
3067 // This is the case, if called from omp_init_lock_with_hint:
3068 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3069 if (!codeptr)
3070 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3071 if (ompt_enabled.ompt_callback_mutex_acquire) {
3072 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3073 ompt_mutex_nest_lock, omp_lock_hint_none,
3074 __ompt_get_mutex_impl_type(user_lock),
3075 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3077 #endif
3078 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3079 #if USE_ITT_BUILD
3080 if (rc) {
3081 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3082 } else {
3083 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3085 #endif
3086 #if OMPT_SUPPORT && OMPT_OPTIONAL
3087 if (ompt_enabled.enabled && rc) {
3088 if (rc == 1) {
3089 if (ompt_enabled.ompt_callback_mutex_acquired) {
3090 // lock_first
3091 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3092 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3093 codeptr);
3095 } else {
3096 if (ompt_enabled.ompt_callback_nest_lock) {
3097 // lock_next
3098 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3099 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3103 #endif
3104 return rc;
3106 #else // KMP_USE_DYNAMIC_LOCK
3108 kmp_user_lock_p lck;
3109 int rc;
3111 if ((__kmp_user_lock_kind == lk_tas) &&
3112 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3113 OMP_NEST_LOCK_T_SIZE)) {
3114 lck = (kmp_user_lock_p)user_lock;
3116 #if KMP_USE_FUTEX
3117 else if ((__kmp_user_lock_kind == lk_futex) &&
3118 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3119 OMP_NEST_LOCK_T_SIZE)) {
3120 lck = (kmp_user_lock_p)user_lock;
3122 #endif
3123 else {
3124 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3127 #if USE_ITT_BUILD
3128 __kmp_itt_lock_acquiring(lck);
3129 #endif /* USE_ITT_BUILD */
3131 #if OMPT_SUPPORT && OMPT_OPTIONAL
3132 // This is the case, if called from omp_init_lock_with_hint:
3133 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3134 if (!codeptr)
3135 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3136 if (ompt_enabled.enabled &&
3137 ompt_enabled.ompt_callback_mutex_acquire) {
3138 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3139 ompt_mutex_nest_lock, omp_lock_hint_none,
3140 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3141 codeptr);
3143 #endif
3145 rc = TEST_NESTED_LOCK(lck, gtid);
3146 #if USE_ITT_BUILD
3147 if (rc) {
3148 __kmp_itt_lock_acquired(lck);
3149 } else {
3150 __kmp_itt_lock_cancelled(lck);
3152 #endif /* USE_ITT_BUILD */
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154 if (ompt_enabled.enabled && rc) {
3155 if (rc == 1) {
3156 if (ompt_enabled.ompt_callback_mutex_acquired) {
3157 // lock_first
3158 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3159 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3161 } else {
3162 if (ompt_enabled.ompt_callback_nest_lock) {
3163 // lock_next
3164 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3165 ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3169 #endif
3170 return rc;
3172 /* Can't use serial interval since not block structured */
3174 #endif // KMP_USE_DYNAMIC_LOCK
3177 // Interface to fast scalable reduce methods routines
3179 // keep the selected method in a thread-local structure for cross-function
3180 // use: it will be read again in the __kmpc_end_reduce* functions;
3181 // an alternative would be to re-determine the method one more time in the
3182 // __kmpc_end_reduce* functions (a new prototype would be required then)
3183 // AT: which solution is better?
3184 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3185 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3187 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3188 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3190 // description of the packed_reduction_method variable: look at the macros in
3191 // kmp.h
3193 // used in a critical section reduce block
3194 static __forceinline void
3195 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3196 kmp_critical_name *crit) {
3198 // this lock was visible to a customer and to the threading profile tool as a
3199 // serial overhead span (although it's used for an internal purpose only)
3200 // why was it visible in the previous implementation?
3201 // should we keep it visible in the new reduce block?
3202 kmp_user_lock_p lck;
3204 #if KMP_USE_DYNAMIC_LOCK
3206 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3207 // Check if it is initialized.
3208 if (*lk == 0) {
3209 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3210 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3211 KMP_GET_D_TAG(__kmp_user_lock_seq));
3212 } else {
3213 __kmp_init_indirect_csptr(crit, loc, global_tid,
3214 KMP_GET_I_TAG(__kmp_user_lock_seq));
3217 // Branch for accessing the actual lock object and set operation. This
3218 // branching is inevitable since this lock initialization does not follow the
3219 // normal dispatch path (lock table is not used).
3220 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3221 lck = (kmp_user_lock_p)lk;
3222 KMP_DEBUG_ASSERT(lck != NULL);
3223 if (__kmp_env_consistency_check) {
3224 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3226 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3227 } else {
3228 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3229 lck = ilk->lock;
3230 KMP_DEBUG_ASSERT(lck != NULL);
3231 if (__kmp_env_consistency_check) {
3232 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3234 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3237 #else // KMP_USE_DYNAMIC_LOCK
3239 // We know that the fast reduction code is only emitted by Intel compilers
3240 // with 32 byte critical sections. If there isn't enough space, then we
3241 // have to use a pointer.
3242 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3243 lck = (kmp_user_lock_p)crit;
3244 } else {
3245 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3247 KMP_DEBUG_ASSERT(lck != NULL);
3249 if (__kmp_env_consistency_check)
3250 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3252 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3254 #endif // KMP_USE_DYNAMIC_LOCK
3257 // used in a critical section reduce block
3258 static __forceinline void
3259 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3260 kmp_critical_name *crit) {
3262 kmp_user_lock_p lck;
3264 #if KMP_USE_DYNAMIC_LOCK
3266 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3267 lck = (kmp_user_lock_p)crit;
3268 if (__kmp_env_consistency_check)
3269 __kmp_pop_sync(global_tid, ct_critical, loc);
3270 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3271 } else {
3272 kmp_indirect_lock_t *ilk =
3273 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3274 if (__kmp_env_consistency_check)
3275 __kmp_pop_sync(global_tid, ct_critical, loc);
3276 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3279 #else // KMP_USE_DYNAMIC_LOCK
3281 // We know that the fast reduction code is only emitted by Intel compilers
3282 // with 32 byte critical sections. If there isn't enough space, then we have
3283 // to use a pointer.
3284 if (__kmp_base_user_lock_size > 32) {
3285 lck = *((kmp_user_lock_p *)crit);
3286 KMP_ASSERT(lck != NULL);
3287 } else {
3288 lck = (kmp_user_lock_p)crit;
3291 if (__kmp_env_consistency_check)
3292 __kmp_pop_sync(global_tid, ct_critical, loc);
3294 __kmp_release_user_lock_with_checks(lck, global_tid);
3296 #endif // KMP_USE_DYNAMIC_LOCK
3297 } // __kmp_end_critical_section_reduce_block
3299 static __forceinline int
3300 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3301 int *task_state) {
3302 kmp_team_t *team;
3304 // Check whether we are inside a teams construct.
3305 if (th->th.th_teams_microtask) {
3306 *team_p = team = th->th.th_team;
3307 if (team->t.t_level == th->th.th_teams_level) {
3308 // This is reduction at teams construct.
3309 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3310 // Let's swap teams temporarily for the reduction.
3311 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3312 th->th.th_team = team->t.t_parent;
3313 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3314 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3315 *task_state = th->th.th_task_state;
3316 th->th.th_task_state = 0;
3318 return 1;
3321 return 0;
3324 static __forceinline void
3325 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3326 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3327 th->th.th_info.ds.ds_tid = 0;
3328 th->th.th_team = team;
3329 th->th.th_team_nproc = team->t.t_nproc;
3330 th->th.th_task_team = team->t.t_task_team[task_state];
3331 th->th.th_task_state = task_state;
3334 /* 2.a.i. Reduce Block without a terminating barrier */
3336 @ingroup SYNCHRONIZATION
3337 @param loc source location information
3338 @param global_tid global thread number
3339 @param num_vars number of items (variables) to be reduced
3340 @param reduce_size size of data in bytes to be reduced
3341 @param reduce_data pointer to data to be reduced
3342 @param reduce_func callback function providing reduction operation on two
3343 operands and returning result of reduction in lhs_data
3344 @param lck pointer to the unique lock data structure
3345 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3346 threads if atomic reduction needed
3348 The nowait version is used for a reduce clause with the nowait argument.
3350 kmp_int32
3351 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3352 size_t reduce_size, void *reduce_data,
3353 void (*reduce_func)(void *lhs_data, void *rhs_data),
3354 kmp_critical_name *lck) {
3356 KMP_COUNT_BLOCK(REDUCE_nowait);
3357 int retval = 0;
3358 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3359 kmp_info_t *th;
3360 kmp_team_t *team;
3361 int teams_swapped = 0, task_state;
3362 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3363 __kmp_assert_valid_gtid(global_tid);
3365 // why do we need this initialization here at all?
3366 // The reduction clause cannot be used as a stand-alone directive.
3368 // do not call __kmp_serial_initialize(), it will be called by
3369 // __kmp_parallel_initialize() if needed
3370 // possible detection of false-positive race by the threadchecker ???
3371 if (!TCR_4(__kmp_init_parallel))
3372 __kmp_parallel_initialize();
3374 __kmp_resume_if_soft_paused();
3376 // check correctness of reduce block nesting
3377 #if KMP_USE_DYNAMIC_LOCK
3378 if (__kmp_env_consistency_check)
3379 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3380 #else
3381 if (__kmp_env_consistency_check)
3382 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3383 #endif
3385 th = __kmp_thread_from_gtid(global_tid);
3386 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3388 // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3389 // the value should be kept in a variable
3390 // the variable should be either a construct-specific or thread-specific
3391 // property, not a team specific property
3392 // (a thread can reach the next reduce block on the next construct, reduce
3393 // method may differ on the next construct)
3394 // an ident_t "loc" parameter could be used as a construct-specific property
3395 // (what if loc == 0?)
3396 // (if both construct-specific and team-specific variables were shared,
3397 // then unnecessary extra syncs would be needed)
3398 // a thread-specific variable is better regarding two issues above (next
3399 // construct and extra syncs)
3400 // a thread-specific "th_local.reduction_method" variable is used currently
3401 // each thread executes 'determine' and 'set' lines (no need to execute by one
3402 // thread, to avoid unness extra syncs)
3404 packed_reduction_method = __kmp_determine_reduction_method(
3405 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3406 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3408 OMPT_REDUCTION_DECL(th, global_tid);
3409 if (packed_reduction_method == critical_reduce_block) {
3411 OMPT_REDUCTION_BEGIN;
3413 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3414 retval = 1;
3416 } else if (packed_reduction_method == empty_reduce_block) {
3418 OMPT_REDUCTION_BEGIN;
3420 // usage: if team size == 1, no synchronization is required ( Intel
3421 // platforms only )
3422 retval = 1;
3424 } else if (packed_reduction_method == atomic_reduce_block) {
3426 retval = 2;
3428 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3429 // won't be called by the code gen)
3430 // (this is not ideal: the checking block has already been closed by
3431 // this 'pop',
3432 // but the atomic operation has not executed yet; it runs
3433 // slightly later, literally on the next instruction)
3434 if (__kmp_env_consistency_check)
3435 __kmp_pop_sync(global_tid, ct_reduce, loc);
3437 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3438 tree_reduce_block)) {
3440 // AT: performance issue: a real barrier here
3441 // AT: (if master goes slow, other threads are blocked here waiting for the
3442 // master to come and release them)
3443 // AT: (it's not what a customer might expect when specifying the NOWAIT clause)
3444 // AT: (specifying NOWAIT won't result in a performance improvement; it'll
3445 // AT: just be confusing to a customer)
3446 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3447 // might go faster and be more in line with the sense of NOWAIT
3448 // AT: TO DO: do epcc test and compare times
3450 // this barrier should be invisible to a customer and to the threading profile
3451 // tool (it's neither a terminating barrier nor customer's code, it's
3452 // used for an internal purpose)
3453 #if OMPT_SUPPORT
3454 // JP: can this barrier potentially lead to task scheduling?
3455 // JP: as long as there is a barrier in the implementation, OMPT should and
3456 // will provide the barrier events
3457 // so we set-up the necessary frame/return addresses.
3458 ompt_frame_t *ompt_frame;
3459 if (ompt_enabled.enabled) {
3460 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3461 if (ompt_frame->enter_frame.ptr == NULL)
3462 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3463 OMPT_STORE_RETURN_ADDRESS(global_tid);
3465 #endif
3466 #if USE_ITT_NOTIFY
3467 __kmp_threads[global_tid]->th.th_ident = loc;
3468 #endif
3469 retval =
3470 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3471 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3472 retval = (retval != 0) ? (0) : (1);
3473 #if OMPT_SUPPORT && OMPT_OPTIONAL
3474 if (ompt_enabled.enabled) {
3475 ompt_frame->enter_frame = ompt_data_none;
3477 #endif
3479 // all workers except the master should do this pop here
3480 // (none of the other workers will get to __kmpc_end_reduce_nowait())
3481 if (__kmp_env_consistency_check) {
3482 if (retval == 0) {
3483 __kmp_pop_sync(global_tid, ct_reduce, loc);
3487 } else {
3489 // should never reach this block
3490 KMP_ASSERT(0); // "unexpected method"
3492 if (teams_swapped) {
3493 __kmp_restore_swapped_teams(th, team, task_state);
3495 KA_TRACE(
3497 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3498 global_tid, packed_reduction_method, retval));
3500 return retval;
}

/*!
3504 @ingroup SYNCHRONIZATION
3505 @param loc source location information
3506 @param global_tid global thread id.
3507 @param lck pointer to the unique lock data structure
3509 Finish the execution of a reduce nowait.
*/
3511 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3512 kmp_critical_name *lck) {
3514 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3516 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3517 __kmp_assert_valid_gtid(global_tid);
3519 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3521 OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3523 if (packed_reduction_method == critical_reduce_block) {
3525 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3526 OMPT_REDUCTION_END;
3528 } else if (packed_reduction_method == empty_reduce_block) {
3530 // usage: if team size == 1, no synchronization is required ( on Intel
3531 // platforms only )
3533 OMPT_REDUCTION_END;
3535 } else if (packed_reduction_method == atomic_reduce_block) {
3537 // neither the master nor the other workers should get here
3538 // (code gen does not generate this call in case 2: atomic reduce block)
3539 // actually it would be better to remove this 'else if' entirely;
3540 // after removal this value would be caught by the 'else' branch and assert
3542 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3543 tree_reduce_block)) {
3545 // only master gets here
3546 // OMPT: tree reduction is annotated in the barrier code
3548 } else {
3550 // should never reach this block
3551 KMP_ASSERT(0); // "unexpected method"
3554 if (__kmp_env_consistency_check)
3555 __kmp_pop_sync(global_tid, ct_reduce, loc);
3557 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3558 global_tid, packed_reduction_method));
3560 return;
}
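// Illustrative sketch (not compiled into the runtime): roughly the call
// sequence a compiler emits for a worksharing reduction with NOWAIT, e.g.
//   #pragma omp for reduction(+ : sum) nowait
// Names such as my_loc, gtid, sum, local_sum, reduce_size, reduce_data,
// my_reduce_func and my_lock are hypothetical placeholders, not runtime
// symbols; reduce_size/reduce_data describe the packed partial results as in
// the parameter documentation of the entry points.
//
//   kmp_int32 res = __kmpc_reduce_nowait(&my_loc, gtid, 1, reduce_size,
//                                        reduce_data, my_reduce_func, &my_lock);
//   switch (res) {
//   case 1: // this thread combines the partial results itself
//     sum += local_sum;
//     __kmpc_end_reduce_nowait(&my_loc, gtid, &my_lock);
//     break;
//   case 2: // atomic update of sum (emitted as an atomic RMW); note that code
//     // gen does not emit __kmpc_end_reduce_nowait() on this path (see the
//     // atomic_reduce_block comments above)
//     break;
//   default: // 0: nothing more for this thread to do here
//     break;
//   }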
3563 /* 2.a.ii. Reduce Block with a terminating barrier */
/*!
3566 @ingroup SYNCHRONIZATION
3567 @param loc source location information
3568 @param global_tid global thread number
3569 @param num_vars number of items (variables) to be reduced
3570 @param reduce_size size of data in bytes to be reduced
3571 @param reduce_data pointer to data to be reduced
3572 @param reduce_func callback function providing reduction operation on two
3573 operands and returning result of reduction in lhs_data
3574 @param lck pointer to the unique lock data structure
3575 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3576 threads if atomic reduction needed
3578 A blocking reduce that includes an implicit barrier.
*/
3580 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3581 size_t reduce_size, void *reduce_data,
3582 void (*reduce_func)(void *lhs_data, void *rhs_data),
3583 kmp_critical_name *lck) {
3584 KMP_COUNT_BLOCK(REDUCE_wait);
3585 int retval = 0;
3586 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3587 kmp_info_t *th;
3588 kmp_team_t *team;
3589 int teams_swapped = 0, task_state;
3591 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3592 __kmp_assert_valid_gtid(global_tid);
3594 // why do we need this initialization here at all?
3595 // A reduction clause cannot be a stand-alone directive.
3597 // do not call __kmp_serial_initialize(), it will be called by
3598 // __kmp_parallel_initialize() if needed
3599 // possible detection of false-positive race by the threadchecker ???
3600 if (!TCR_4(__kmp_init_parallel))
3601 __kmp_parallel_initialize();
3603 __kmp_resume_if_soft_paused();
3605 // check correctness of reduce block nesting
3606 #if KMP_USE_DYNAMIC_LOCK
3607 if (__kmp_env_consistency_check)
3608 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3609 #else
3610 if (__kmp_env_consistency_check)
3611 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3612 #endif
3614 th = __kmp_thread_from_gtid(global_tid);
3615 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3617 packed_reduction_method = __kmp_determine_reduction_method(
3618 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3619 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3621 OMPT_REDUCTION_DECL(th, global_tid);
3623 if (packed_reduction_method == critical_reduce_block) {
3625 OMPT_REDUCTION_BEGIN;
3626 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3627 retval = 1;
3629 } else if (packed_reduction_method == empty_reduce_block) {
3631 OMPT_REDUCTION_BEGIN;
3632 // usage: if team size == 1, no synchronization is required ( Intel
3633 // platforms only )
3634 retval = 1;
3636 } else if (packed_reduction_method == atomic_reduce_block) {
3638 retval = 2;
3640 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3641 tree_reduce_block)) {
3643 // case tree_reduce_block:
3644 // this barrier should be visible to a customer and to the threading profile
3645 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3646 #if OMPT_SUPPORT
3647 ompt_frame_t *ompt_frame;
3648 if (ompt_enabled.enabled) {
3649 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3650 if (ompt_frame->enter_frame.ptr == NULL)
3651 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3652 OMPT_STORE_RETURN_ADDRESS(global_tid);
3654 #endif
3655 #if USE_ITT_NOTIFY
3656 __kmp_threads[global_tid]->th.th_ident =
3657 loc; // needed for correct notification of frames
3658 #endif
3659 retval =
3660 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3661 global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3662 retval = (retval != 0) ? (0) : (1);
3663 #if OMPT_SUPPORT && OMPT_OPTIONAL
3664 if (ompt_enabled.enabled) {
3665 ompt_frame->enter_frame = ompt_data_none;
3667 #endif
3669 // all workers except the master should do this pop here
3670 // (none of the workers except the master will enter __kmpc_end_reduce())
3671 if (__kmp_env_consistency_check) {
3672 if (retval == 0) { // 0: all other workers; 1: master
3673 __kmp_pop_sync(global_tid, ct_reduce, loc);
3677 } else {
3679 // should never reach this block
3680 KMP_ASSERT(0); // "unexpected method"
3682 if (teams_swapped) {
3683 __kmp_restore_swapped_teams(th, team, task_state);
3686 KA_TRACE(10,
3687 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3688 global_tid, packed_reduction_method, retval));
3689 return retval;
}

/*!
3693 @ingroup SYNCHRONIZATION
3694 @param loc source location information
3695 @param global_tid global thread id.
3696 @param lck pointer to the unique lock data structure
3698 Finish the execution of a blocking reduce.
3699 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3700 start function.
*/
3702 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3703 kmp_critical_name *lck) {
3705 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3706 kmp_info_t *th;
3707 kmp_team_t *team;
3708 int teams_swapped = 0, task_state;
3710 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3711 __kmp_assert_valid_gtid(global_tid);
3713 th = __kmp_thread_from_gtid(global_tid);
3714 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3716 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3718 // this barrier should be visible to a customer and to the threading profile
3719 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3720 OMPT_REDUCTION_DECL(th, global_tid);
3722 if (packed_reduction_method == critical_reduce_block) {
3723 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3725 OMPT_REDUCTION_END;
3727 // TODO: implicit barrier: should be exposed
3728 #if OMPT_SUPPORT
3729 ompt_frame_t *ompt_frame;
3730 if (ompt_enabled.enabled) {
3731 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3732 if (ompt_frame->enter_frame.ptr == NULL)
3733 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3734 OMPT_STORE_RETURN_ADDRESS(global_tid);
3736 #endif
3737 #if USE_ITT_NOTIFY
3738 __kmp_threads[global_tid]->th.th_ident = loc;
3739 #endif
3740 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3741 #if OMPT_SUPPORT && OMPT_OPTIONAL
3742 if (ompt_enabled.enabled) {
3743 ompt_frame->enter_frame = ompt_data_none;
3745 #endif
3747 } else if (packed_reduction_method == empty_reduce_block) {
3749 OMPT_REDUCTION_END;
3751 // usage: if team size==1, no synchronization is required (Intel platforms only)
3753 // TODO: implicit barrier: should be exposed
3754 #if OMPT_SUPPORT
3755 ompt_frame_t *ompt_frame;
3756 if (ompt_enabled.enabled) {
3757 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3758 if (ompt_frame->enter_frame.ptr == NULL)
3759 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3760 OMPT_STORE_RETURN_ADDRESS(global_tid);
3762 #endif
3763 #if USE_ITT_NOTIFY
3764 __kmp_threads[global_tid]->th.th_ident = loc;
3765 #endif
3766 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3767 #if OMPT_SUPPORT && OMPT_OPTIONAL
3768 if (ompt_enabled.enabled) {
3769 ompt_frame->enter_frame = ompt_data_none;
3771 #endif
3773 } else if (packed_reduction_method == atomic_reduce_block) {
3775 #if OMPT_SUPPORT
3776 ompt_frame_t *ompt_frame;
3777 if (ompt_enabled.enabled) {
3778 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3779 if (ompt_frame->enter_frame.ptr == NULL)
3780 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3781 OMPT_STORE_RETURN_ADDRESS(global_tid);
3783 #endif
3784 // TODO: implicit barrier: should be exposed
3785 #if USE_ITT_NOTIFY
3786 __kmp_threads[global_tid]->th.th_ident = loc;
3787 #endif
3788 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3789 #if OMPT_SUPPORT && OMPT_OPTIONAL
3790 if (ompt_enabled.enabled) {
3791 ompt_frame->enter_frame = ompt_data_none;
3793 #endif
3795 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3796 tree_reduce_block)) {
3798 // only master executes here (master releases all other workers)
3799 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3800 global_tid);
3802 } else {
3804 // should never reach this block
3805 KMP_ASSERT(0); // "unexpected method"
3807 if (teams_swapped) {
3808 __kmp_restore_swapped_teams(th, team, task_state);
3811 if (__kmp_env_consistency_check)
3812 __kmp_pop_sync(global_tid, ct_reduce, loc);
3814 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3815 global_tid, packed_reduction_method));
3817 return;
}
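// Illustrative sketch (not compiled into the runtime): the blocking variant of
// the same pattern, e.g. for
//   #pragma omp for reduction(+ : sum)
// without NOWAIT. The difference from the nowait sketch above is that the
// atomic path also appears to go through __kmpc_end_reduce(), whose
// atomic_reduce_block branch performs the terminating barrier, while threads
// that receive 0 are released by the barrier inside __kmpc_reduce() or by the
// master's __kmpc_end_reduce(). Placeholder names as before.
//
//   kmp_int32 res = __kmpc_reduce(&my_loc, gtid, 1, reduce_size, reduce_data,
//                                 my_reduce_func, &my_lock);
//   if (res == 1) { // this thread combines the partial results itself
//     sum += local_sum;
//     __kmpc_end_reduce(&my_loc, gtid, &my_lock);
//   } else if (res == 2) { // atomic update of sum, then join the barrier
//     __kmpc_end_reduce(&my_loc, gtid, &my_lock);
//   }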
3820 #undef __KMP_GET_REDUCTION_METHOD
3821 #undef __KMP_SET_REDUCTION_METHOD
3823 /* end of interface to fast scalable reduce routines */
3825 kmp_uint64 __kmpc_get_taskid() {
3827 kmp_int32 gtid;
3828 kmp_info_t *thread;
3830 gtid = __kmp_get_gtid();
3831 if (gtid < 0) {
3832 return 0;
3834 thread = __kmp_thread_from_gtid(gtid);
3835 return thread->th.th_current_task->td_task_id;
3837 } // __kmpc_get_taskid
3839 kmp_uint64 __kmpc_get_parent_taskid() {
3841 kmp_int32 gtid;
3842 kmp_info_t *thread;
3843 kmp_taskdata_t *parent_task;
3845 gtid = __kmp_get_gtid();
3846 if (gtid < 0) {
3847 return 0;
3849 thread = __kmp_thread_from_gtid(gtid);
3850 parent_task = thread->th.th_current_task->td_parent;
3851 return (parent_task == NULL ? 0 : parent_task->td_task_id);
3853 } // __kmpc_get_parent_taskid
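// Illustrative sketch (hypothetical tool/user-side usage, not runtime code):
// both getters simply return 0 when the calling thread is unknown to the
// runtime (gtid < 0) or, for the parent query, when the current task has no
// parent.
//
//   kmp_uint64 task_id = __kmpc_get_taskid();
//   kmp_uint64 parent_id = __kmpc_get_parent_taskid(); // 0 if no parent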
/*!
3856 @ingroup WORK_SHARING
3857 @param loc source location information.
3858 @param gtid global thread number.
3859 @param num_dims number of associated doacross loops.
3860 @param dims info on loops bounds.
3862 Initialize doacross loop information.
3863 Expects the compiler to send us inclusive bounds,
3864 e.g. for (i = 2; i < 9; i += 2) lo = 2, up = 8, st = 2.
*/
3866 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3867 const struct kmp_dim *dims) {
3868 __kmp_assert_valid_gtid(gtid);
3869 int j, idx;
3870 kmp_int64 last, trace_count;
3871 kmp_info_t *th = __kmp_threads[gtid];
3872 kmp_team_t *team = th->th.th_team;
3873 kmp_uint32 *flags;
3874 kmp_disp_t *pr_buf = th->th.th_dispatch;
3875 dispatch_shared_info_t *sh_buf;
3877 KA_TRACE(
3879 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3880 gtid, num_dims, !team->t.t_serialized));
3881 KMP_DEBUG_ASSERT(dims != NULL);
3882 KMP_DEBUG_ASSERT(num_dims > 0);
3884 if (team->t.t_serialized) {
3885 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3886 return; // no dependencies if team is serialized
3888 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3889 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3890 // the next loop
3891 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3893 // Save bounds info into allocated private buffer
3894 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3895 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3896 th, sizeof(kmp_int64) * (4 * num_dims + 1));
3897 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3898 pr_buf->th_doacross_info[0] =
3899 (kmp_int64)num_dims; // first element is number of dimensions
3900 // Save also address of num_done in order to access it later without knowing
3901 // the buffer index
3902 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3903 pr_buf->th_doacross_info[2] = dims[0].lo;
3904 pr_buf->th_doacross_info[3] = dims[0].up;
3905 pr_buf->th_doacross_info[4] = dims[0].st;
3906 last = 5;
3907 for (j = 1; j < num_dims; ++j) {
3908 kmp_int64
3909 range_length; // To keep ranges of all dimensions but the first dims[0]
3910 if (dims[j].st == 1) { // most common case
3911 // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
3912 range_length = dims[j].up - dims[j].lo + 1;
3913 } else {
3914 if (dims[j].st > 0) {
3915 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3916 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3917 } else { // negative increment
3918 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3919 range_length =
3920 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3923 pr_buf->th_doacross_info[last++] = range_length;
3924 pr_buf->th_doacross_info[last++] = dims[j].lo;
3925 pr_buf->th_doacross_info[last++] = dims[j].up;
3926 pr_buf->th_doacross_info[last++] = dims[j].st;
3929 // Compute total trip count.
3930 // Start with range of dims[0] which we don't need to keep in the buffer.
3931 if (dims[0].st == 1) { // most common case
3932 trace_count = dims[0].up - dims[0].lo + 1;
3933 } else if (dims[0].st > 0) {
3934 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3935 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3936 } else { // negative increment
3937 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3938 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3940 for (j = 1; j < num_dims; ++j) {
3941 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3943 KMP_DEBUG_ASSERT(trace_count > 0);
3945 // Check if shared buffer is not occupied by other loop (idx -
3946 // __kmp_dispatch_num_buffers)
3947 if (idx != sh_buf->doacross_buf_idx) {
3948 // Shared buffer is occupied, wait for it to be free
3949 __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3950 __kmp_eq_4, NULL);
3952 #if KMP_32_BIT_ARCH
3953 // Check if we are the first thread. After the CAS the first thread gets 0,
3954 // others get 1 if initialization is in progress, allocated pointer otherwise.
3955 // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3956 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3957 (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3958 #else
3959 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3960 (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3961 #endif
3962 if (flags == NULL) {
3963 // we are the first thread, allocate the array of flags
3964 size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3965 flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3966 KMP_MB();
3967 sh_buf->doacross_flags = flags;
3968 } else if (flags == (kmp_uint32 *)1) {
3969 #if KMP_32_BIT_ARCH
3970 // initialization is still in progress, need to wait
3971 while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3972 #else
3973 while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3974 #endif
3975 KMP_YIELD(TRUE);
3976 KMP_MB();
3977 } else {
3978 KMP_MB();
3980 KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3981 pr_buf->th_doacross_flags =
3982 sh_buf->doacross_flags; // save private copy in order to not
3983 // touch shared buffer on each iteration
3984 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
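// Illustrative sketch (not compiled into the runtime): how a compiler might
// fill the dims argument for the loop from the comment above,
//   for (i = 2; i < 9; i += 2)  ->  lo = 2, up = 8, st = 2
// when it carries an ordered(1) clause. The lo/up/st field names follow the
// accesses made in this function (dims[0].lo, dims[0].up, dims[0].st);
// my_loc and gtid are placeholders.
//
//   struct kmp_dim dim;
//   dim.lo = 2; // inclusive lower bound
//   dim.up = 8; // inclusive upper bound
//   dim.st = 2; // loop stride
//   __kmpc_doacross_init(&my_loc, gtid, 1, &dim);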
3987 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3988 __kmp_assert_valid_gtid(gtid);
3989 kmp_int32 shft, num_dims, i;
3990 kmp_uint32 flag;
3991 kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3992 kmp_info_t *th = __kmp_threads[gtid];
3993 kmp_team_t *team = th->th.th_team;
3994 kmp_disp_t *pr_buf;
3995 kmp_int64 lo, up, st;
3997 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3998 if (team->t.t_serialized) {
3999 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4000 return; // no dependencies if team is serialized
4003 // calculate sequential iteration number and check out-of-bounds condition
4004 pr_buf = th->th.th_dispatch;
4005 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4006 num_dims = pr_buf->th_doacross_info[0];
4007 lo = pr_buf->th_doacross_info[2];
4008 up = pr_buf->th_doacross_info[3];
4009 st = pr_buf->th_doacross_info[4];
4010 #if OMPT_SUPPORT && OMPT_OPTIONAL
4011 ompt_dependence_t deps[num_dims];
4012 #endif
4013 if (st == 1) { // most common case
4014 if (vec[0] < lo || vec[0] > up) {
4015 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4016 "bounds [%lld,%lld]\n",
4017 gtid, vec[0], lo, up));
4018 return;
4020 iter_number = vec[0] - lo;
4021 } else if (st > 0) {
4022 if (vec[0] < lo || vec[0] > up) {
4023 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4024 "bounds [%lld,%lld]\n",
4025 gtid, vec[0], lo, up));
4026 return;
4028 iter_number = (kmp_uint64)(vec[0] - lo) / st;
4029 } else { // negative increment
4030 if (vec[0] > lo || vec[0] < up) {
4031 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4032 "bounds [%lld,%lld]\n",
4033 gtid, vec[0], lo, up));
4034 return;
4036 iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4038 #if OMPT_SUPPORT && OMPT_OPTIONAL
4039 deps[0].variable.value = iter_number;
4040 deps[0].dependence_type = ompt_dependence_type_sink;
4041 #endif
4042 for (i = 1; i < num_dims; ++i) {
4043 kmp_int64 iter, ln;
4044 kmp_int32 j = i * 4;
4045 ln = pr_buf->th_doacross_info[j + 1];
4046 lo = pr_buf->th_doacross_info[j + 2];
4047 up = pr_buf->th_doacross_info[j + 3];
4048 st = pr_buf->th_doacross_info[j + 4];
4049 if (st == 1) {
4050 if (vec[i] < lo || vec[i] > up) {
4051 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4052 "bounds [%lld,%lld]\n",
4053 gtid, vec[i], lo, up));
4054 return;
4056 iter = vec[i] - lo;
4057 } else if (st > 0) {
4058 if (vec[i] < lo || vec[i] > up) {
4059 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4060 "bounds [%lld,%lld]\n",
4061 gtid, vec[i], lo, up));
4062 return;
4064 iter = (kmp_uint64)(vec[i] - lo) / st;
4065 } else { // st < 0
4066 if (vec[i] > lo || vec[i] < up) {
4067 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4068 "bounds [%lld,%lld]\n",
4069 gtid, vec[i], lo, up));
4070 return;
4072 iter = (kmp_uint64)(lo - vec[i]) / (-st);
4074 iter_number = iter + ln * iter_number;
4075 #if OMPT_SUPPORT && OMPT_OPTIONAL
4076 deps[i].variable.value = iter;
4077 deps[i].dependence_type = ompt_dependence_type_sink;
4078 #endif
4080 shft = iter_number % 32; // use 32-bit granularity
4081 iter_number >>= 5; // divided by 32
4082 flag = 1 << shft;
4083 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4084 KMP_YIELD(TRUE);
4086 KMP_MB();
4087 #if OMPT_SUPPORT && OMPT_OPTIONAL
4088 if (ompt_enabled.ompt_callback_dependences) {
4089 ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4090 &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4092 #endif
4093 KA_TRACE(20,
4094 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4095 gtid, (iter_number << 5) + shft));
}
4098 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4099 __kmp_assert_valid_gtid(gtid);
4100 kmp_int32 shft, num_dims, i;
4101 kmp_uint32 flag;
4102 kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4103 kmp_info_t *th = __kmp_threads[gtid];
4104 kmp_team_t *team = th->th.th_team;
4105 kmp_disp_t *pr_buf;
4106 kmp_int64 lo, st;
4108 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4109 if (team->t.t_serialized) {
4110 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4111 return; // no dependencies if team is serialized
4114 // calculate sequential iteration number (same as in "wait" but no
4115 // out-of-bounds checks)
4116 pr_buf = th->th.th_dispatch;
4117 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4118 num_dims = pr_buf->th_doacross_info[0];
4119 lo = pr_buf->th_doacross_info[2];
4120 st = pr_buf->th_doacross_info[4];
4121 #if OMPT_SUPPORT && OMPT_OPTIONAL
4122 ompt_dependence_t deps[num_dims];
4123 #endif
4124 if (st == 1) { // most common case
4125 iter_number = vec[0] - lo;
4126 } else if (st > 0) {
4127 iter_number = (kmp_uint64)(vec[0] - lo) / st;
4128 } else { // negative increment
4129 iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4131 #if OMPT_SUPPORT && OMPT_OPTIONAL
4132 deps[0].variable.value = iter_number;
4133 deps[0].dependence_type = ompt_dependence_type_source;
4134 #endif
4135 for (i = 1; i < num_dims; ++i) {
4136 kmp_int64 iter, ln;
4137 kmp_int32 j = i * 4;
4138 ln = pr_buf->th_doacross_info[j + 1];
4139 lo = pr_buf->th_doacross_info[j + 2];
4140 st = pr_buf->th_doacross_info[j + 4];
4141 if (st == 1) {
4142 iter = vec[i] - lo;
4143 } else if (st > 0) {
4144 iter = (kmp_uint64)(vec[i] - lo) / st;
4145 } else { // st < 0
4146 iter = (kmp_uint64)(lo - vec[i]) / (-st);
4148 iter_number = iter + ln * iter_number;
4149 #if OMPT_SUPPORT && OMPT_OPTIONAL
4150 deps[i].variable.value = iter;
4151 deps[i].dependence_type = ompt_dependence_type_source;
4152 #endif
4154 #if OMPT_SUPPORT && OMPT_OPTIONAL
4155 if (ompt_enabled.ompt_callback_dependences) {
4156 ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4157 &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4159 #endif
4160 shft = iter_number % 32; // use 32-bit granularity
4161 iter_number >>= 5; // divided by 32
4162 flag = 1 << shft;
4163 KMP_MB();
4164 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4165 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4166 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4167 (iter_number << 5) + shft));
}
4170 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4171 __kmp_assert_valid_gtid(gtid);
4172 kmp_int32 num_done;
4173 kmp_info_t *th = __kmp_threads[gtid];
4174 kmp_team_t *team = th->th.th_team;
4175 kmp_disp_t *pr_buf = th->th.th_dispatch;
4177 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4178 if (team->t.t_serialized) {
4179 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4180 return; // nothing to do
4182 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4183 if (num_done == th->th.th_team_nproc) {
4184 // we are the last thread, need to free shared resources
4185 int idx = pr_buf->th_doacross_buf_idx - 1;
4186 dispatch_shared_info_t *sh_buf =
4187 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4188 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4189 (kmp_int64)&sh_buf->doacross_num_done);
4190 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4191 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4192 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4193 sh_buf->doacross_flags = NULL;
4194 sh_buf->doacross_num_done = 0;
4195 sh_buf->doacross_buf_idx +=
4196 __kmp_dispatch_num_buffers; // free buffer for future re-use
4198 // free private resources (need to keep buffer index forever)
4199 pr_buf->th_doacross_flags = NULL;
4200 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4201 pr_buf->th_doacross_info = NULL;
4202 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
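// Illustrative sketch (not compiled into the runtime): the per-iteration calls
// a compiler might emit for the ordered(1) loop initialized in the sketch
// after __kmpc_doacross_init() above, where the loop body contains
//   #pragma omp ordered depend(sink : i - 2)  and  #pragma omp ordered depend(source)
// The vec arrays carry iteration vectors; my_loc and gtid are placeholders.
//
//   for (kmp_int64 i = 2; i < 9; i += 2) {
//     kmp_int64 sink_vec[1] = {i - 2};
//     __kmpc_doacross_wait(&my_loc, gtid, sink_vec); // returns immediately when
//                                                    // the sink is out of bounds
//     // ... ordered work for iteration i ...
//     kmp_int64 src_vec[1] = {i};
//     __kmpc_doacross_post(&my_loc, gtid, src_vec); // mark iteration i as done
//   }
//   __kmpc_doacross_fini(&my_loc, gtid); // once per thread, after the loop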
4205 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4206 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4207 return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}
4210 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4211 __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}
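// Illustrative sketch (user-level usage, not runtime code): these thin
// wrappers forward to __kmpc_alloc/__kmpc_free with the caller's gtid, so
// C/C++ code can use the standard OpenMP allocator API directly, e.g. with
// the predefined omp_default_mem_alloc allocator:
//
//   double *buf = (double *)omp_alloc(1024 * sizeof(double),
//                                     omp_default_mem_alloc);
//   // ... use buf ...
//   omp_free(buf, omp_default_mem_alloc);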
4214 int __kmpc_get_target_offload(void) {
4215 if (!__kmp_init_serial) {
4216 __kmp_serial_initialize();
4218 return __kmp_target_offload;
}
4221 int __kmpc_pause_resource(kmp_pause_status_t level) {
4222 if (!__kmp_init_serial) {
4223 return 1; // Can't pause if runtime is not initialized
4225 return __kmp_pause_resource(level);
}