/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if (entries & TASK_STACK_INDEX_MASK == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping: move down a level in the stack
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

// __kmp_push_task_stack: Push the tied task onto the task stack.
//   Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

// __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
//   the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

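// Illustrative note (editor sketch, not part of the upstream sources): under
// the Task Scheduling Constraint the loop above walks tasknew's parent chain
// upward while the parent's td_level is still greater than current->td_level.
// For example, if `current` sits at nesting level 2 and `tasknew` at level 5,
// the walk visits the ancestors at levels 4, 3 and 2, and the candidate is
// allowed only if the walk reaches `current` itself; otherwise it is rejected.
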
// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

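// Worked example (editor sketch, not part of the upstream sources): suppose
// size = 4, the deque slots hold [a, b, c, d] and td_deque_head = 2, so the
// logical FIFO order is c, d, a, b. The copy loop above starts at i = head and
// wraps with TASK_DEQUE_MASK, producing new_deque = [c, d, a, b, -, -, -, -];
// head is then rebased to 0 and tail set to the old size (4), preserving the
// task order while the capacity grows to new_size = 8.
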
//  __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata

  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

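// Note on the ring buffer used above (editor sketch, not part of the upstream
// sources): TASK_DEQUE_SIZE is a power of two, so advancing the tail with
// "(td_deque_tail + 1) & TASK_DEQUE_MASK(td)" wraps without a division, e.g.
// with a mask of 0xff a tail of 255 becomes 0. Only the owning thread pushes
// at the tail; removal (by the owner or by stealing threads, implemented later
// in this file) happens under td_deque_lock, which is why ntasks is published
// with TCW_4 while the lock is still held.
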
// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.
static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif // OMPT_SUPPORT

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

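// Counting sketch (editor note, not part of the upstream sources):
// td_allocated_child_tasks starts at 1 for the task itself and is incremented
// once per explicit child allocated, so a task that allocated two children
// carries the value 3. The "- 1" above removes the task's own reference; the
// task is freed only when the remaining count reaches 0, and the loop then
// moves to the parent, dropping the freed child's reference from the parent's
// counter before deciding whether the parent can be freed as well.
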
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we switch back
        // the omp_fulfill_event signals completion
        // locking is necessary to avoid a race with ompt_task_late_fulfill
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE) {
      // Predecrement simulated by "- 1" calculation
      children =
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
      __kmp_release_deps(gtid, taskdata);
    } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
      // if we found proxy tasks there could exist a dependency chain
      // with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  __kmp_assert_valid_gtid(gtid);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

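// Example (editor sketch, not part of the upstream sources): assuming val is a
// power of two, __kmp_round_up_to_val(20, 8) clears the low bits (20 & ~7 ==
// 16) and then adds val, returning 24, while a size that is already a multiple
// of val, e.g. 32, is returned unchanged.
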
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_flags.detachable = flags->detachable;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable task
  if (flags->proxy == TASK_PROXY ||
      flags->detachable == TASK_DETACHABLE ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

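// Resulting layout of the single allocation made above (editor sketch, not
// part of the upstream sources):
//
//   +----------------+--------------------------------+---------+---------+
//   | kmp_taskdata_t | kmp_task_t + compiler privates | padding | shareds |
//   +----------------+--------------------------------+---------+---------+
//   ^ taskdata         ^ task = KMP_TASKDATA_TO_TASK(taskdata)   ^ task->shareds
//
// shareds_offset is sizeof(kmp_taskdata_t) + sizeof_kmp_task_t rounded up to
// sizeof(void *), so the pointers stored in the shareds block stay aligned.
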
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
  __kmp_assert_valid_gtid(gtid);
  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                input_flags->proxy ? "proxy" : "",
                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

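// Typical compiler-generated call sequence (editor sketch under assumptions,
// not emitted by this file; omp_task_entry_ stands for the outlined task
// routine, __kmpc_omp_task is the enqueue entry point defined further below in
// the runtime, and the flags value follows the kmp_tasking_flags_t encoding
// where bit 0 set means "tied"):
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
//                                         sizeof(kmp_task_t) + privates_size,
//                                         shareds_size, &omp_task_entry_);
//   /* ... fill in t->shareds and the firstprivate area ... */
//   __kmpc_omp_task(&loc, gtid, t); // defer the task (or run it immediately)
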
kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {
  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
                                kmp_task_t *new_task) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent;
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent = new_taskdata->td_parent;
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */

  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
       gtid, loc_ref, new_taskdata));

  ANNOTATE_HAPPENS_BEFORE(new_task);
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return TASK_CURRENT_NOT_QUEUED;
}
// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
//    execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  ANNOTATE_HAPPENS_BEFORE(new_task);
  return TASK_CURRENT_NOT_QUEUED;
}
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));
  __kmp_assert_valid_gtid(gtid);

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
        parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_data_t task_data = ompt_data_none;
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            parent ? &(parent->ompt_task_info.task_data) : &task_data,
            parent ? &(parent->ompt_task_info.frame) : NULL,
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
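/* Illustrative sketch (editor's addition): the kind of user code that an
   OpenMP compiler lowers into __kmpc_omp_task_alloc() followed by
   __kmpc_omp_task(). work() is a hypothetical routine; the point is that each
   iteration becomes a deferrable explicit task, which the runtime may still
   execute immediately if it cannot be pushed onto the deque.

     void work(int i); // hypothetical user routine
     void spawn_tasks(int n) {
     #pragma omp parallel
     #pragma omp single
       for (int i = 0; i < n; ++i) {
     #pragma omp task firstprivate(i)
         work(i);
       }
     }
*/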
// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, void *codeptr_ra) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
    parent = new_taskdata->td_parent;
    if (!parent->ompt_task_info.frame.enter_frame.ptr)
      parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
          codeptr_ra);
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    if (must_wait) {
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &(taskdata->td_incomplete_child_tasks)),
                       0U);
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

    ANNOTATE_HAPPENS_AFTER(taskdata);
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
                                            return_address);
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete.
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
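/* Illustrative sketch (editor's addition): __kmpc_omp_taskwait implements the
   user-level taskwait construct shown below. The generating task waits
   (possibly executing other tasks meanwhile) until its direct child tasks are
   complete; work() is a hypothetical routine.

     int work(int i); // hypothetical user routine
     int wait_for_children(void) {
       int a = 0, b = 0;
     #pragma omp task shared(a)
       a = work(1);
     #pragma omp task shared(b)
       b = work(2);
     #pragma omp taskwait // both child tasks are complete past this point
       return a + b;
     }
*/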
// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          __kmp_execute_tasks_32(
              thread, gtid, NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
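/* Illustrative sketch (editor's addition): __kmpc_omp_taskyield implements
   the taskyield construct, which is only a scheduling hint - when tasking is
   enabled, the code above honors it by executing other queued tasks at that
   point, and otherwise it is a no-op.

     #include <omp.h>
     void yield_while_contended(omp_lock_t *lk) {
       while (!omp_test_lock(lk)) {
     #pragma omp taskyield // let the runtime schedule other tasks meanwhile
       }
       // ... work under the lock ...
       omp_unset_lock(lk);
     }
*/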
// Task Reduction implementation
//
// Note: initial implementation didn't take into account the possibility
// to specify omp_orig for initializer of the UDR (user defined reduction).
// Corrected implementation takes into account the omp_orig object.
// Compiler is free to use old implementation if omp_orig is not specified.

/*!
@ingroup BASIC_TYPES
@{
*/

/*!
Flags for special info per task reduction item.
*/
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
  unsigned lazy_priv : 1;
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;

/*!
Internal struct for reduction data item related info set up by compiler.
*/
typedef struct kmp_task_red_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item in bytes */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (single parameter) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;

/*!
Internal struct for reduction data item related info saved by the library.
*/
typedef struct kmp_taskred_data {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
  void *reduce_priv; /**< array of thread specific items */
  void *reduce_pend; /**< end of private data for faster comparison op */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_comb; /**< data combiner routine */
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_orig; /**< original item (can be used in UDR initializer) */
} kmp_taskred_data_t;

/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
typedef struct kmp_taskred_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  void *reduce_orig; /**< original reduction item used for initialization */
  size_t reduce_size; /**< size of data item */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;
/*!
@}
*/
template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                             kmp_task_red_input_t &src) {
  item.reduce_orig = NULL;
}
template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                            kmp_taskred_input_t &src) {
  if (src.reduce_orig != NULL) {
    item.reduce_orig = src.reduce_orig;
  } else {
    item.reduce_orig = src.reduce_shar;
  } // non-NULL reduce_orig means new interface used
}

template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                           int offset) {
  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
}
template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                          int offset) {
  ((void (*)(void *, void *))item.reduce_init)(
      (char *)(item.reduce_priv) + offset, item.reduce_orig);
}
template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_int32 nth = thread->th.th_team_nproc;
  kmp_taskred_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  if (nth == 1) {
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    size_t size = data[i].reduce_size - 1;
    // round the size up to cache line per thread-specific item
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = data[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].flags = data[i].flags;
    arr[i].reduce_comb = data[i].reduce_comb;
    arr[i].reduce_init = data[i].reduce_init;
    arr[i].reduce_fini = data[i].reduce_fini;
    __kmp_assign_orig<T>(arr[i], data[i]);
    if (!arr[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (arr[i].reduce_init != NULL) {
        // initialize all thread-specific items
        for (int j = 0; j < nth; ++j) {
          __kmp_call_init<T>(arr[i], j * size);
        }
      }
    } else {
      // only allocate space for pointers now,
      // objects will be lazily allocated/initialized if/when requested
      // note that __kmp_allocate zeroes the allocated memory
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}
/*!
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}

/*!
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
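/* Illustrative sketch (editor's addition): __kmpc_taskred_init (and the older
   __kmpc_task_reduction_init) back the task_reduction/in_reduction clauses.
   A compiler lowers the clause into the kmp_taskred_input_t array passed to
   this entry; in user code the same thing looks like:

     long sum_list(int n, const int *v) {
       long sum = 0;
     #pragma omp taskgroup task_reduction(+ : sum)
       {
         for (int i = 0; i < n; ++i) {
     #pragma omp task in_reduction(+ : sum) firstprivate(i)
           sum += v[i];
         }
       }
       return sum;
     }
*/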
// Copy task reduction data (except for shared pointers).
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
                                    kmp_taskgroup_t *tg, void *reduce_data) {
  kmp_taskred_data_t *arr;
  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
                " from data %p\n",
                thr, tg, reduce_data));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thr, num * sizeof(kmp_taskred_data_t));
  // threads will share private copies, thunk routines, sizes, flags, etc.:
  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
}
/*!
@param gtid    Global thread ID
@param tskgrp  The taskgroup ID (optional)
@param data    Shared location of the item
@return The pointer to per-thread data

Get thread-specific location of data item
*/
void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2230 __kmp_assert_valid_gtid(gtid
);
2231 kmp_info_t
*thread
= __kmp_threads
[gtid
];
2232 kmp_int32 nth
= thread
->th
.th_team_nproc
;
2234 return data
; // nothing to do
2236 kmp_taskgroup_t
*tg
= (kmp_taskgroup_t
*)tskgrp
;
2238 tg
= thread
->th
.th_current_task
->td_taskgroup
;
2239 KMP_ASSERT(tg
!= NULL
);
2240 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)(tg
->reduce_data
);
2241 kmp_int32 num
= tg
->reduce_num_data
;
2242 kmp_int32 tid
= thread
->th
.th_info
.ds
.ds_tid
;
2244 KMP_ASSERT(data
!= NULL
);
2245 while (tg
!= NULL
) {
2246 for (int i
= 0; i
< num
; ++i
) {
2247 if (!arr
[i
].flags
.lazy_priv
) {
2248 if (data
== arr
[i
].reduce_shar
||
2249 (data
>= arr
[i
].reduce_priv
&& data
< arr
[i
].reduce_pend
))
2250 return (char *)(arr
[i
].reduce_priv
) + tid
* arr
[i
].reduce_size
;
2252 // check shared location first
2253 void **p_priv
= (void **)(arr
[i
].reduce_priv
);
2254 if (data
== arr
[i
].reduce_shar
)
2256 // check if we get some thread specific location as parameter
2257 for (int j
= 0; j
< nth
; ++j
)
2258 if (data
== p_priv
[j
])
2260 continue; // not found, continue search
2262 if (p_priv
[tid
] == NULL
) {
2263 // allocate thread specific object lazily
2264 p_priv
[tid
] = __kmp_allocate(arr
[i
].reduce_size
);
2265 if (arr
[i
].reduce_init
!= NULL
) {
2266 if (arr
[i
].reduce_orig
!= NULL
) { // new interface
2267 ((void (*)(void *, void *))arr
[i
].reduce_init
)(
2268 p_priv
[tid
], arr
[i
].reduce_orig
);
2269 } else { // old interface (single parameter)
2270 ((void (*)(void *))arr
[i
].reduce_init
)(p_priv
[tid
]);
2278 arr
= (kmp_taskred_data_t
*)(tg
->reduce_data
);
2279 num
= tg
->reduce_num_data
;
2281 KMP_ASSERT2(0, "Unknown task reduction item");
2282 return NULL
; // ERROR, this line never executed
2285 // Finalize task reduction.
2286 // Called from __kmpc_end_taskgroup()
2287 static void __kmp_task_reduction_fini(kmp_info_t
*th
, kmp_taskgroup_t
*tg
) {
2288 kmp_int32 nth
= th
->th
.th_team_nproc
;
2289 KMP_DEBUG_ASSERT(nth
> 1); // should not be called if nth == 1
2290 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)tg
->reduce_data
;
2291 kmp_int32 num
= tg
->reduce_num_data
;
2292 for (int i
= 0; i
< num
; ++i
) {
2293 void *sh_data
= arr
[i
].reduce_shar
;
2294 void (*f_fini
)(void *) = (void (*)(void *))(arr
[i
].reduce_fini
);
2295 void (*f_comb
)(void *, void *) =
2296 (void (*)(void *, void *))(arr
[i
].reduce_comb
);
2297 if (!arr
[i
].flags
.lazy_priv
) {
2298 void *pr_data
= arr
[i
].reduce_priv
;
2299 size_t size
= arr
[i
].reduce_size
;
2300 for (int j
= 0; j
< nth
; ++j
) {
2301 void *priv_data
= (char *)pr_data
+ j
* size
;
2302 f_comb(sh_data
, priv_data
); // combine results
2304 f_fini(priv_data
); // finalize if needed
2307 void **pr_data
= (void **)(arr
[i
].reduce_priv
);
2308 for (int j
= 0; j
< nth
; ++j
) {
2309 if (pr_data
[j
] != NULL
) {
2310 f_comb(sh_data
, pr_data
[j
]); // combine results
2312 f_fini(pr_data
[j
]); // finalize if needed
2313 __kmp_free(pr_data
[j
]);
2317 __kmp_free(arr
[i
].reduce_priv
);
2319 __kmp_thread_free(th
, arr
);
2320 tg
->reduce_data
= NULL
;
2321 tg
->reduce_num_data
= 0;
// Cleanup task reduction data for parallel or worksharing,
// do not touch task private data other threads still working with.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
  __kmp_thread_free(th, tg->reduce_data);
  tg->reduce_data = NULL;
  tg->reduce_num_data = 0;
}
2333 template <typename T
>
2334 void *__kmp_task_reduction_modifier_init(ident_t
*loc
, int gtid
, int is_ws
,
2336 __kmp_assert_valid_gtid(gtid
);
2337 kmp_info_t
*thr
= __kmp_threads
[gtid
];
2338 kmp_int32 nth
= thr
->th
.th_team_nproc
;
2339 __kmpc_taskgroup(loc
, gtid
); // form new taskgroup first
2342 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2343 gtid
, thr
->th
.th_current_task
->td_taskgroup
));
2344 return (void *)thr
->th
.th_current_task
->td_taskgroup
;
2346 kmp_team_t
*team
= thr
->th
.th_team
;
2348 kmp_taskgroup_t
*tg
;
2349 reduce_data
= KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_reduce_data
[is_ws
]);
2350 if (reduce_data
== NULL
&&
2351 __kmp_atomic_compare_store(&team
->t
.t_tg_reduce_data
[is_ws
], reduce_data
,
2353 // single thread enters this block to initialize common reduction data
2354 KMP_DEBUG_ASSERT(reduce_data
== NULL
);
2355 // first initialize own data, then make a copy other threads can use
2356 tg
= (kmp_taskgroup_t
*)__kmp_task_reduction_init
<T
>(gtid
, num
, data
);
2357 reduce_data
= __kmp_thread_malloc(thr
, num
* sizeof(kmp_taskred_data_t
));
2358 KMP_MEMCPY(reduce_data
, tg
->reduce_data
, num
* sizeof(kmp_taskred_data_t
));
2359 // fini counters should be 0 at this point
2360 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_fini_counter
[0]) == 0);
2361 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_fini_counter
[1]) == 0);
2362 KMP_ATOMIC_ST_REL(&team
->t
.t_tg_reduce_data
[is_ws
], reduce_data
);
2365 (reduce_data
= KMP_ATOMIC_LD_ACQ(&team
->t
.t_tg_reduce_data
[is_ws
])) ==
2366 (void *)1) { // wait for task reduction initialization
2369 KMP_DEBUG_ASSERT(reduce_data
> (void *)1); // should be valid pointer here
2370 tg
= thr
->th
.th_current_task
->td_taskgroup
;
2371 __kmp_task_reduction_init_copy
<T
>(thr
, num
, data
, tg
, reduce_data
);
/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                          int num, void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_task_red_input_t *)data);
}

/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
                                   void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_taskred_input_t *)data);
}

/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  __kmpc_end_taskgroup(loc, gtid);
}
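/* Illustrative sketch (editor's addition): the modifier_init/fini entries
   above implement the "task" reduction modifier on parallel and worksharing
   constructs, which behaves like an implicit taskgroup with task reduction
   wrapped around the region. In user code:

     long sum_with_tasks(int n, const int *v) {
       long sum = 0;
     #pragma omp parallel reduction(task, + : sum)
       {
     #pragma omp single
         for (int i = 0; i < n; ++i) {
     #pragma omp task in_reduction(+ : sum) firstprivate(i)
           sum += v[i];
         }
       }
       return sum;
     }
*/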
// __kmpc_taskgroup: Start a new taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *tg_new =
      (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
  tg_new->parent = taskdata->td_taskgroup;
  tg_new->reduce_data = NULL;
  tg_new->reduce_num_data = 0;
  taskdata->td_taskgroup = tg_new;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    kmp_team_t *team = thread->th.th_team;
    ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;

    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}
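/* Illustrative sketch (editor's addition): __kmpc_taskgroup and
   __kmpc_end_taskgroup below bracket the user-level taskgroup region. Unlike
   taskwait, the end of a taskgroup waits for all descendant tasks created in
   the region, not just the direct children. traverse() is hypothetical.

     void traverse(void *node); // hypothetical user routine
     void process_tree(void *root) {
     #pragma omp taskgroup
       {
     #pragma omp task
         traverse(root); // may recursively create nested tasks
       } // all tasks created in the region, at any depth, are done here
     }
*/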
2462 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2463 // and its descendants are complete
2464 void __kmpc_end_taskgroup(ident_t
*loc
, int gtid
) {
2465 __kmp_assert_valid_gtid(gtid
);
2466 kmp_info_t
*thread
= __kmp_threads
[gtid
];
2467 kmp_taskdata_t
*taskdata
= thread
->th
.th_current_task
;
2468 kmp_taskgroup_t
*taskgroup
= taskdata
->td_taskgroup
;
2469 int thread_finished
= FALSE
;
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473 ompt_data_t my_task_data
;
2474 ompt_data_t my_parallel_data
;
2476 if (UNLIKELY(ompt_enabled
.enabled
)) {
2477 team
= thread
->th
.th_team
;
2478 my_task_data
= taskdata
->ompt_task_info
.task_data
;
2479 // FIXME: I think this is wrong for lwt!
2480 my_parallel_data
= team
->t
.ompt_team_info
.parallel_data
;
2481 codeptr
= OMPT_LOAD_RETURN_ADDRESS(gtid
);
2483 codeptr
= OMPT_GET_RETURN_ADDRESS(0);
2487 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid
, loc
));
2488 KMP_DEBUG_ASSERT(taskgroup
!= NULL
);
2489 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP
);
2491 if (__kmp_tasking_mode
!= tskm_immediate_exec
) {
2492 // mark task as waiting not on a barrier
2493 taskdata
->td_taskwait_counter
+= 1;
2494 taskdata
->td_taskwait_ident
= loc
;
2495 taskdata
->td_taskwait_thread
= gtid
+ 1;
2497 // For ITT the taskgroup wait is similar to taskwait until we need to
2499 void *itt_sync_obj
= __kmp_itt_taskwait_object(gtid
);
2500 if (itt_sync_obj
!= NULL
)
2501 __kmp_itt_taskwait_starting(gtid
, itt_sync_obj
);
2502 #endif /* USE_ITT_BUILD */
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
2505 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region_wait
)) {
2506 ompt_callbacks
.ompt_callback(ompt_callback_sync_region_wait
)(
2507 ompt_sync_region_taskgroup
, ompt_scope_begin
, &(my_parallel_data
),
2508 &(my_task_data
), codeptr
);
2512 if (!taskdata
->td_flags
.team_serial
||
2513 (thread
->th
.th_task_team
!= NULL
&&
2514 thread
->th
.th_task_team
->tt
.tt_found_proxy_tasks
)) {
2515 kmp_flag_32
flag(RCAST(std::atomic
<kmp_uint32
> *, &(taskgroup
->count
)),
2517 while (KMP_ATOMIC_LD_ACQ(&taskgroup
->count
) != 0) {
2518 flag
.execute_tasks(thread
, gtid
, FALSE
,
2519 &thread_finished
USE_ITT_BUILD_ARG(itt_sync_obj
),
2520 __kmp_task_stealing_constraint
);
2523 taskdata
->td_taskwait_thread
= -taskdata
->td_taskwait_thread
; // end waiting
2525 #if OMPT_SUPPORT && OMPT_OPTIONAL
2526 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region_wait
)) {
2527 ompt_callbacks
.ompt_callback(ompt_callback_sync_region_wait
)(
2528 ompt_sync_region_taskgroup
, ompt_scope_end
, &(my_parallel_data
),
2529 &(my_task_data
), codeptr
);
2534 if (itt_sync_obj
!= NULL
)
2535 __kmp_itt_taskwait_finished(gtid
, itt_sync_obj
);
2536 KMP_FSYNC_ACQUIRED(taskdata
); // acquire self - sync with descendants
2537 #endif /* USE_ITT_BUILD */
2539 KMP_DEBUG_ASSERT(taskgroup
->count
== 0);
2541 if (taskgroup
->reduce_data
!= NULL
) { // need to reduce?
2544 kmp_team_t
*t
= thread
->th
.th_team
;
2545 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)taskgroup
->reduce_data
;
2546 // check if <priv> data of the first reduction variable shared for the team
2547 void *priv0
= arr
[0].reduce_priv
;
2548 if ((reduce_data
= KMP_ATOMIC_LD_ACQ(&t
->t
.t_tg_reduce_data
[0])) != NULL
&&
2549 ((kmp_taskred_data_t
*)reduce_data
)[0].reduce_priv
== priv0
) {
2550 // finishing task reduction on parallel
2551 cnt
= KMP_ATOMIC_INC(&t
->t
.t_tg_fini_counter
[0]);
2552 if (cnt
== thread
->th
.th_team_nproc
- 1) {
2553 // we are the last thread passing __kmpc_reduction_modifier_fini()
2554 // finalize task reduction:
2555 __kmp_task_reduction_fini(thread
, taskgroup
);
2556 // cleanup fields in the team structure:
2557 // TODO: is relaxed store enough here (whole barrier should follow)?
2558 __kmp_thread_free(thread
, reduce_data
);
2559 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_reduce_data
[0], NULL
);
2560 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_fini_counter
[0], 0);
2562 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2563 // so do not finalize reduction, just clean own copy of the data
2564 __kmp_task_reduction_clean(thread
, taskgroup
);
2566 } else if ((reduce_data
= KMP_ATOMIC_LD_ACQ(&t
->t
.t_tg_reduce_data
[1])) !=
2568 ((kmp_taskred_data_t
*)reduce_data
)[0].reduce_priv
== priv0
) {
2569 // finishing task reduction on worksharing
2570 cnt
= KMP_ATOMIC_INC(&t
->t
.t_tg_fini_counter
[1]);
2571 if (cnt
== thread
->th
.th_team_nproc
- 1) {
2572 // we are the last thread passing __kmpc_reduction_modifier_fini()
2573 __kmp_task_reduction_fini(thread
, taskgroup
);
2574 // cleanup fields in team structure:
2575 // TODO: is relaxed store enough here (whole barrier should follow)?
2576 __kmp_thread_free(thread
, reduce_data
);
2577 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_reduce_data
[1], NULL
);
2578 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_fini_counter
[1], 0);
2580 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2581 // so do not finalize reduction, just clean own copy of the data
2582 __kmp_task_reduction_clean(thread
, taskgroup
);
2585 // finishing task reduction on taskgroup
2586 __kmp_task_reduction_fini(thread
, taskgroup
);
2589 // Restore parent taskgroup for the current task
2590 taskdata
->td_taskgroup
= taskgroup
->parent
;
2591 __kmp_thread_free(thread
, taskgroup
);
2593 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2595 ANNOTATE_HAPPENS_AFTER(taskdata
);
2597 #if OMPT_SUPPORT && OMPT_OPTIONAL
2598 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region
)) {
2599 ompt_callbacks
.ompt_callback(ompt_callback_sync_region
)(
2600 ompt_sync_region_taskgroup
, ompt_scope_end
, &(my_parallel_data
),
2601 &(my_task_data
), codeptr
);
2606 // __kmp_remove_my_task: remove a task from my own deque
2607 static kmp_task_t
*__kmp_remove_my_task(kmp_info_t
*thread
, kmp_int32 gtid
,
2608 kmp_task_team_t
*task_team
,
2609 kmp_int32 is_constrained
) {
2611 kmp_taskdata_t
*taskdata
;
2612 kmp_thread_data_t
*thread_data
;
2615 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2616 KMP_DEBUG_ASSERT(task_team
->tt
.tt_threads_data
!=
2617 NULL
); // Caller should check this condition
2619 thread_data
= &task_team
->tt
.tt_threads_data
[__kmp_tid_from_gtid(gtid
)];
2621 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2622 gtid
, thread_data
->td
.td_deque_ntasks
,
2623 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2625 if (TCR_4(thread_data
->td
.td_deque_ntasks
) == 0) {
2627 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2628 "ntasks=%d head=%u tail=%u\n",
2629 gtid
, thread_data
->td
.td_deque_ntasks
,
2630 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2634 __kmp_acquire_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2636 if (TCR_4(thread_data
->td
.td_deque_ntasks
) == 0) {
2637 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2639 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2640 "ntasks=%d head=%u tail=%u\n",
2641 gtid
, thread_data
->td
.td_deque_ntasks
,
2642 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2646 tail
= (thread_data
->td
.td_deque_tail
- 1) &
2647 TASK_DEQUE_MASK(thread_data
->td
); // Wrap index.
2648 taskdata
= thread_data
->td
.td_deque
[tail
];
2650 if (!__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
,
2651 thread
->th
.th_current_task
)) {
2652 // The TSC does not allow to steal victim task
2653 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2655 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2656 "ntasks=%d head=%u tail=%u\n",
2657 gtid
, thread_data
->td
.td_deque_ntasks
,
2658 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2662 thread_data
->td
.td_deque_tail
= tail
;
2663 TCW_4(thread_data
->td
.td_deque_ntasks
, thread_data
->td
.td_deque_ntasks
- 1);
2665 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2667 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2668 "ntasks=%d head=%u tail=%u\n",
2669 gtid
, taskdata
, thread_data
->td
.td_deque_ntasks
,
2670 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2672 task
= KMP_TASKDATA_TO_TASK(taskdata
);
2676 // __kmp_steal_task: remove a task from another thread's deque
2677 // Assume that calling thread has already checked existence of
2678 // task_team thread_data before calling this routine.
2679 static kmp_task_t
*__kmp_steal_task(kmp_info_t
*victim_thr
, kmp_int32 gtid
,
2680 kmp_task_team_t
*task_team
,
2681 std::atomic
<kmp_int32
> *unfinished_threads
,
2682 int *thread_finished
,
2683 kmp_int32 is_constrained
) {
2685 kmp_taskdata_t
*taskdata
;
2686 kmp_taskdata_t
*current
;
2687 kmp_thread_data_t
*victim_td
, *threads_data
;
2689 kmp_int32 victim_tid
;
2691 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2693 threads_data
= task_team
->tt
.tt_threads_data
;
2694 KMP_DEBUG_ASSERT(threads_data
!= NULL
); // Caller should check this condition
2696 victim_tid
= victim_thr
->th
.th_info
.ds
.ds_tid
;
2697 victim_td
= &threads_data
[victim_tid
];
2699 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2700 "task_team=%p ntasks=%d head=%u tail=%u\n",
2701 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2702 victim_td
->td
.td_deque_ntasks
, victim_td
->td
.td_deque_head
,
2703 victim_td
->td
.td_deque_tail
));
2705 if (TCR_4(victim_td
->td
.td_deque_ntasks
) == 0) {
2706 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2707 "task_team=%p ntasks=%d head=%u tail=%u\n",
2708 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2709 victim_td
->td
.td_deque_ntasks
, victim_td
->td
.td_deque_head
,
2710 victim_td
->td
.td_deque_tail
));
2714 __kmp_acquire_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2716 int ntasks
= TCR_4(victim_td
->td
.td_deque_ntasks
);
2717 // Check again after we acquire the lock
2719 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2720 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2721 "task_team=%p ntasks=%d head=%u tail=%u\n",
2722 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2723 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2727 KMP_DEBUG_ASSERT(victim_td
->td
.td_deque
!= NULL
);
2728 current
= __kmp_threads
[gtid
]->th
.th_current_task
;
2729 taskdata
= victim_td
->td
.td_deque
[victim_td
->td
.td_deque_head
];
2730 if (__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
, current
)) {
2731 // Bump head pointer and Wrap.
2732 victim_td
->td
.td_deque_head
=
2733 (victim_td
->td
.td_deque_head
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2735 if (!task_team
->tt
.tt_untied_task_encountered
) {
2736 // The TSC does not allow to steal victim task
2737 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2738 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2739 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2740 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2741 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2745 // walk through victim's deque trying to steal any task
2746 target
= victim_td
->td
.td_deque_head
;
2748 for (i
= 1; i
< ntasks
; ++i
) {
2749 target
= (target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2750 taskdata
= victim_td
->td
.td_deque
[target
];
2751 if (__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
, current
)) {
2752 break; // found victim task
2757 if (taskdata
== NULL
) {
2758 // No appropriate candidate to steal found
2759 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2760 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2761 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2762 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2763 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2767 for (i
= i
+ 1; i
< ntasks
; ++i
) {
2768 // shift remaining tasks in the deque left by 1
2769 target
= (target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2770 victim_td
->td
.td_deque
[prev
] = victim_td
->td
.td_deque
[target
];
2774 victim_td
->td
.td_deque_tail
==
2775 (kmp_uint32
)((target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
)));
2776 victim_td
->td
.td_deque_tail
= target
; // tail -= 1 (wrapped))
2778 if (*thread_finished
) {
2779 // We need to un-mark this victim as a finished victim. This must be done
2780 // before releasing the lock, or else other threads (starting with the
2781 // master victim) might be prematurely released from the barrier!!!
2784 count
= KMP_ATOMIC_INC(unfinished_threads
);
2788 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2789 gtid
, count
+ 1, task_team
));
2791 *thread_finished
= FALSE
;
2793 TCW_4(victim_td
->td
.td_deque_ntasks
, ntasks
- 1);
2795 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2797 KMP_COUNT_BLOCK(TASK_stolen
);
2799 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2800 "task_team=%p ntasks=%d head=%u tail=%u\n",
2801 gtid
, taskdata
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2802 ntasks
, victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2804 task
= KMP_TASKDATA_TO_TASK(taskdata
);
// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
2818 static inline int __kmp_execute_tasks_template(
2819 kmp_info_t
*thread
, kmp_int32 gtid
, C
*flag
, int final_spin
,
2820 int *thread_finished
USE_ITT_BUILD_ARG(void *itt_sync_obj
),
2821 kmp_int32 is_constrained
) {
2822 kmp_task_team_t
*task_team
= thread
->th
.th_task_team
;
2823 kmp_thread_data_t
*threads_data
;
2825 kmp_info_t
*other_thread
;
2826 kmp_taskdata_t
*current_task
= thread
->th
.th_current_task
;
2827 std::atomic
<kmp_int32
> *unfinished_threads
;
2828 kmp_int32 nthreads
, victim_tid
= -2, use_own_tasks
= 1, new_victim
= 0,
2829 tid
= thread
->th
.th_info
.ds
.ds_tid
;
2831 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2832 KMP_DEBUG_ASSERT(thread
== __kmp_threads
[gtid
]);
2834 if (task_team
== NULL
|| current_task
== NULL
)
2837 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2838 "*thread_finished=%d\n",
2839 gtid
, final_spin
, *thread_finished
));
2841 thread
->th
.th_reap_state
= KMP_NOT_SAFE_TO_REAP
;
2842 threads_data
= (kmp_thread_data_t
*)TCR_PTR(task_team
->tt
.tt_threads_data
);
2843 KMP_DEBUG_ASSERT(threads_data
!= NULL
);
2845 nthreads
= task_team
->tt
.tt_nproc
;
2846 unfinished_threads
= &(task_team
->tt
.tt_unfinished_threads
);
2847 KMP_DEBUG_ASSERT(nthreads
> 1 || task_team
->tt
.tt_found_proxy_tasks
);
2848 KMP_DEBUG_ASSERT(*unfinished_threads
>= 0);
2850 while (1) { // Outer loop keeps trying to find tasks in case of single thread
2851 // getting tasks from target constructs
2852 while (1) { // Inner loop to find a task and execute it
2854 if (use_own_tasks
) { // check on own queue first
2855 task
= __kmp_remove_my_task(thread
, gtid
, task_team
, is_constrained
);
2857 if ((task
== NULL
) && (nthreads
> 1)) { // Steal a task
2860 // Try to steal from the last place I stole from successfully.
2861 if (victim_tid
== -2) { // haven't stolen anything yet
2862 victim_tid
= threads_data
[tid
].td
.td_deque_last_stolen
;
2864 -1) // if we have a last stolen from victim, get the thread
2865 other_thread
= threads_data
[victim_tid
].td
.td_thr
;
2867 if (victim_tid
!= -1) { // found last victim
2869 } else if (!new_victim
) { // no recent steals and we haven't already
2870 // used a new victim; select a random thread
2871 do { // Find a different thread to steal work from.
2872 // Pick a random thread. Initial plan was to cycle through all the
2873 // threads, and only return if we tried to steal from every thread,
2874 // and failed. Arch says that's not such a great idea.
2875 victim_tid
= __kmp_get_random(thread
) % (nthreads
- 1);
2876 if (victim_tid
>= tid
) {
2877 ++victim_tid
; // Adjusts random distribution to exclude self
2879 // Found a potential victim
2880 other_thread
= threads_data
[victim_tid
].td
.td_thr
;
2881 // There is a slight chance that __kmp_enable_tasking() did not wake
2882 // up all threads waiting at the barrier. If victim is sleeping,
2883 // then wake it up. Since we were going to pay the cache miss
2884 // penalty for referencing another thread's kmp_info_t struct
2886 // the check shouldn't cost too much performance at this point. In
2887 // extra barrier mode, tasks do not sleep at the separate tasking
2888 // barrier, so this isn't a problem.
2890 if ((__kmp_tasking_mode
== tskm_task_teams
) &&
2891 (__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
) &&
2892 (TCR_PTR(CCAST(void *, other_thread
->th
.th_sleep_loc
)) !=
2895 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread
),
2896 other_thread
->th
.th_sleep_loc
);
2897 // A sleeping thread should not have any tasks on it's queue.
2898 // There is a slight possibility that it resumes, steals a task
2899 // from another thread, which spawns more tasks, all in the time
2900 // that it takes this thread to check => don't write an assertion
2901 // that the victim's queue is empty. Try stealing from a
2902 // different thread.
2908 // We have a victim to try to steal from
2909 task
= __kmp_steal_task(other_thread
, gtid
, task_team
,
2910 unfinished_threads
, thread_finished
,
2913 if (task
!= NULL
) { // set last stolen to victim
2914 if (threads_data
[tid
].td
.td_deque_last_stolen
!= victim_tid
) {
2915 threads_data
[tid
].td
.td_deque_last_stolen
= victim_tid
;
2916 // The pre-refactored code did not try more than 1 successful new
2917 // vicitm, unless the last one generated more local tasks;
2918 // new_victim keeps track of this
2921 } else { // No tasks found; unset last_stolen
2922 KMP_CHECK_UPDATE(threads_data
[tid
].td
.td_deque_last_stolen
, -1);
2923 victim_tid
= -2; // no successful victim found
2927 if (task
== NULL
) // break out of tasking loop
2930 // Found a task; execute it
2931 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2932 if (__itt_sync_create_ptr
|| KMP_ITT_DEBUG
) {
2933 if (itt_sync_obj
== NULL
) { // we are at fork barrier where we could not
2934 // get the object reliably
2935 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
2937 __kmp_itt_task_starting(itt_sync_obj
);
2939 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2940 __kmp_invoke_task(gtid
, task
, current_task
);
2942 if (itt_sync_obj
!= NULL
)
2943 __kmp_itt_task_finished(itt_sync_obj
);
2944 #endif /* USE_ITT_BUILD */
2945 // If this thread is only partway through the barrier and the condition is
2946 // met, then return now, so that the barrier gather/release pattern can
2947 // proceed. If this thread is in the last spin loop in the barrier,
2948 // waiting to be released, we know that the termination condition will not
2949 // be satisfied, so don't waste any cycles checking it.
2950 if (flag
== NULL
|| (!final_spin
&& flag
->done_check())) {
2953 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2957 if (thread
->th
.th_task_team
== NULL
) {
2960 KMP_YIELD(__kmp_library
== library_throughput
); // Yield before next task
2961 // If execution of a stolen task results in more tasks being placed on our
2962 // run queue, reset use_own_tasks
2963 if (!use_own_tasks
&& TCR_4(threads_data
[tid
].td
.td_deque_ntasks
) != 0) {
2964 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2965 "other tasks, restart\n",
2972 // The task source has been exhausted. If in final spin loop of barrier,
2973 // check if termination condition is satisfied. The work queue may be empty
2974 // but there might be proxy tasks still executing.
2976 KMP_ATOMIC_LD_ACQ(¤t_task
->td_incomplete_child_tasks
) == 0) {
2977 // First, decrement the #unfinished threads, if that has not already been
2978 // done. This decrement might be to the spin location, and result in the
2979 // termination condition being satisfied.
2980 if (!*thread_finished
) {
2983 count
= KMP_ATOMIC_DEC(unfinished_threads
) - 1;
2984 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2985 "unfinished_threads to %d task_team=%p\n",
2986 gtid
, count
, task_team
));
2987 *thread_finished
= TRUE
;
2990 // It is now unsafe to reference thread->th.th_team !!!
2991 // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2992 // thread to pass through the barrier, where it might reset each thread's
2993 // th.th_team field for the next parallel region. If we can steal more
2994 // work, we know that this has not happened yet.
2995 if (flag
!= NULL
&& flag
->done_check()) {
2998 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3004 // If this thread's task team is NULL, master has recognized that there are
3005 // no more tasks; bail out
3006 if (thread
->th
.th_task_team
== NULL
) {
3008 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid
));
3012 // We could be getting tasks from target constructs; if this is the only
3013 // thread, keep trying to execute tasks from own queue
3018 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid
));
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue; // no need to wake up the encountering thread
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // to see if other threads are sleeping (using the same random mechanism
      // that is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}
/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code, free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to each
 * of the other threads in the team, so that it can steal work from them.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
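
/* Illustrative sketch (not part of the build): the free list declared above is
   a simple intrusive, singly-linked LIFO protected by __kmp_task_team_lock.
   Conceptually (the helper names below are only for illustration):

     static void push_free_task_team(kmp_task_team_t *task_team) {
       __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
       task_team->tt.tt_next = __kmp_free_task_teams; // link at the head
       __kmp_free_task_teams = task_team;
       __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
     }

     static kmp_task_team_t *pop_free_task_team(void) {
       kmp_task_team_t *task_team;
       __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
       task_team = __kmp_free_task_teams;
       if (task_team != NULL)
         __kmp_free_task_teams = task_team->tt.tt_next; // unlink the head
       __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
       return task_team;
     }

   __kmp_allocate_task_team() and __kmp_free_task_team() below implement this
   pattern, with TCR_PTR/TCW_PTR wrappers on the shared pointer. */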
// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the necessary
// data structures relating to the deque. This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}
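
/* Illustrative sketch (not part of the build): the deque allocated above is a
   power-of-two ring buffer, so the head/tail arithmetic used elsewhere in this
   file reduces to a bitwise AND with the mask (TASK_DEQUE_MASK). Roughly:

     // push at the tail (deque lock held)
     deque[tail] = taskdata;
     tail = (tail + 1) & (deque_size - 1);
     ntasks = ntasks + 1;

     // pop from the head, e.g. when stealing (deque lock held)
     taskdata = deque[head];
     head = (head + 1) & (deque_size - 1);
     ntasks = ntasks - 1;

   The real push/steal paths add the locking, the "last stolen" hint, and the
   grow-on-full logic of __kmp_realloc_task_deque(). */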
// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque != NULL) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL;
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  }

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}
// __kmp_realloc_task_threads_data:
// Allocates a threads_data array for a task team, either by allocating an
// initial array or enlarging an existing array. Only the first thread to get
// the lock allocs or enlarges the array and re-initializes the array elements.
// That thread returns "TRUE", the rest return "FALSE".
// Assumes that the new array size is given by task_team -> tt.tt_nproc.
// The current size is given by task_team -> tt.tt_max_threads.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {
  kmp_thread_data_t **threads_data_p;
  kmp_int32 nthreads, maxthreads;
  int is_init_thread = FALSE;

  if (TCR_4(task_team->tt.tt_found_tasks)) {
    // Already reallocated and initialized.
    return FALSE;
  }

  threads_data_p = &task_team->tt.tt_threads_data;
  nthreads = task_team->tt.tt_nproc;
  maxthreads = task_team->tt.tt_max_threads;

  // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // they are used.
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  if (!TCR_4(task_team->tt.tt_found_tasks)) {
    // first thread to enable tasking
    kmp_team_t *team = thread->th.th_team;
    int i;

    is_init_thread = TRUE;
    if (maxthreads < nthreads) {

      if (*threads_data_p != NULL) {
        kmp_thread_data_t *old_data = *threads_data_p;
        kmp_thread_data_t *new_data = NULL;

        KE_TRACE(
            10,
            ("__kmp_realloc_task_threads_data: T#%d reallocating "
             "threads data for task_team %p, new_size = %d, old_size = %d\n",
             __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
        // Reallocate threads_data to have more elements than current array
        // Cannot use __kmp_thread_realloc() because threads not around for
        // kmp_reap_task_team( ). Note all new array entries are initialized
        // to zero by __kmp_allocate().
        new_data = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
        // copy old data to new data
        KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
                     (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));

#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        for (i = maxthreads; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
        // Install the new data and free the old data
        (*threads_data_p) = new_data;
        __kmp_free(old_data);
      } else {
        KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
                      "threads data for task_team %p, size = %d\n",
                      __kmp_gtid_from_thread(thread), task_team, nthreads));
        // Make the initial allocate for threads_data array, and zero entries
        // Cannot use __kmp_thread_calloc() because threads not around for
        // kmp_reap_task_team( ).
        ANNOTATE_IGNORE_WRITES_BEGIN();
        *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
        ANNOTATE_IGNORE_WRITES_END();
#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        for (i = 0; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
      }
      task_team->tt.tt_max_threads = nthreads;
    } else {
      // If array has (more than) enough elements, go ahead and use it
      KMP_DEBUG_ASSERT(*threads_data_p != NULL);
    }

    // initialize threads_data pointers back to thread_info structures
    for (i = 0; i < nthreads; i++) {
      kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
      thread_data->td.td_thr = team->t.t_threads[i];

      if (thread_data->td.td_deque_last_stolen >= nthreads) {
        // The last stolen field survives across teams / barrier, and the number
        // of threads may have changed. It's possible (likely?) that a new
        // parallel region will exhibit the same behavior as previous region.
        thread_data->td.td_deque_last_stolen = -1;
      }
    }

    TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
  return is_init_thread;
}
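
/* Illustrative note (sketch, not part of the build): the routine above is a
   check / lock / re-check pattern, so only the first thread pays for the
   (re)allocation and later threads return quickly:

     if (tt_found_tasks)          // cheap unsynchronized read
       return FALSE;
     acquire(tt_threads_lock);
     if (!tt_found_tasks) {       // re-check under the lock
       ... allocate or grow threads_data, initialize entries ...
       TCW_SYNC_4(tt_found_tasks, TRUE); // publish after initialization
     }
     release(tt_threads_lock);
     return is_init_thread;

   The publishing store is synchronized (TCW_SYNC_4) so threads that take the
   fast path and skip the lock observe a fully initialized array. */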
// __kmp_free_task_threads_data:
// Deallocates a threads_data array for a task team, including any attached
// tasking deques. Only occurs at library shutdown.
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
  if (task_team->tt.tt_threads_data != NULL) {
    int i;
    for (i = 0; i < task_team->tt.tt_max_threads; i++) {
      __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
    }
    __kmp_free(task_team->tt.tt_threads_data);
    task_team->tt.tt_threads_data = NULL;
  }
  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
}
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
// structures.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;
  int nthreads;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  __kmp_gtid_from_thread(thread), team));
    // Allocate a new task team if one is not available. Cannot use
    // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
    // suppress race conditions detection on synchronization flags in debug mode
    // this helps to analyze library internals eliminating false positives
    __itt_suppress_mark_range(
        __itt_suppress_range, __itt_suppress_threading_errors,
        &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
    __itt_suppress_mark_range(__itt_suppress_range,
                              __itt_suppress_threading_errors,
                              CCAST(kmp_uint32 *, &task_team->tt.tt_active),
                              sizeof(task_team->tt.tt_active));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
    // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
    // task_team->tt.tt_threads_data = NULL;
    // task_team->tt.tt_max_threads = 0;
    // task_team->tt.tt_next = NULL;
  }

  TCW_4(task_team->tt.tt_found_tasks, FALSE);
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
  TCW_4(task_team->tt.tt_active, TRUE);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
  return task_team;
}
// __kmp_free_task_team:
// Frees the task team associated with a specific thread, and adds it
// to the global task team free list.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));

  // Put task team back on free list
  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);

  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
  task_team->tt.tt_next = __kmp_free_task_teams;
  TCW_PTR(__kmp_free_task_teams, task_team);

  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
}
// __kmp_reap_task_teams:
// Free all the task teams on the task team free list.
// Should only be done during library shutdown.
// Cannot do anything that needs a thread structure or gtid since they are
// already gone.
void __kmp_reap_task_teams(void) {
  kmp_task_team_t *task_team;

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Free all task_teams on the free list
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    while ((task_team = __kmp_free_task_teams) != NULL) {
      __kmp_free_task_teams = task_team->tt.tt_next;
      task_team->tt.tt_next = NULL;

      // Free threads_data if necessary
      if (task_team->tt.tt_threads_data != NULL) {
        __kmp_free_task_threads_data(task_team);
      }
      __kmp_free(task_team);
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }
}
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If oversubscribed or have waited a bit, yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
}
// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct (above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                          team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}
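
/* Illustrative sketch (not part of the build): each kmp_team_t carries two
   task-team slots, t_task_team[0] and t_task_team[1], and each thread's
   th_task_state selects the slot it currently uses. Per barrier, roughly:

     // master, in __kmp_task_team_setup(): prepare the *other* slot
     team->t.t_task_team[1 - state] = allocated_or_recycled_task_team;

     // every thread, in __kmp_task_team_sync(), after the release phase:
     state = 1 - state;                                   // flip parity
     this_thr->th.th_task_team = team->t.t_task_team[state];

   This double buffering lets late threads keep draining the old task team
   while the master has already set up the one for the next region. */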
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
}
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &task_team->tt.tt_unfinished_threads),
                       0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}
// __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE);
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}
// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit;

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}
/* The finish of the proxy tasks is divided in two pieces:
   - the top half is the one that can be done from a thread outside the team
   - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
   - things that can be run before queuing the bottom half
   - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
   halves. */
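
/* Rough ordering sketch (illustrative, not part of the build):

     __kmp_first_top_half_finish_proxy(td);  // mark complete; bump the proxy's
                                             // own child count as a keep-alive
     __kmp_give_task(...);                   // queue the bottom half into a
                                             // thread of the team
     __kmp_second_top_half_finish_proxy(td); // decrement the parent's counter,
                                             // then drop the keep-alive

   The bottom half spins until the keep-alive count reaches zero, so it cannot
   free the task while the second top half is still referencing it. */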
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child
  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));
  __kmp_assert_valid_gtid(gtid);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}
/*!
@ingroup TASKING
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that could not belong to
the team.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = 0;
  kmp_int32 pass = 1;
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    thread = team->t.t_threads[k];
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

  } while (!__kmp_give_task(thread, k, ptask, pass));

  __kmp_second_top_half_finish_proxy(taskdata);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       taskdata));
}
kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
                                                kmp_task_t *task) {
  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
    td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
    td->td_allow_completion_event.ed.task = task;
    __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
  }
  return &td->td_allow_completion_event;
}
void __kmp_fulfill_event(kmp_event_t *event) {
  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
    kmp_task_t *ptask = event->ed.task;
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
    bool detached = false;
    int gtid = __kmp_get_gtid();

    // The associated task might have completed or could be completing at this
    // point.
    // We need to take the lock to avoid races
    __kmp_acquire_tas_lock(&event->lock, gtid);
    if (taskdata->td_flags.proxy == TASK_PROXY) {
      detached = true;
    } else {
#if OMPT_SUPPORT
      // The OMPT event must occur under mutual exclusion,
      // otherwise the tool might access ptask after free
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
#endif
    }
    event->type = KMP_EVENT_UNINITIALIZED;
    __kmp_release_tas_lock(&event->lock, gtid);

    if (detached) {
#if OMPT_SUPPORT
      // We free ptask afterwards and know the task is finished,
      // so locking is not necessary
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
#endif
      // If the task detached complete the proxy task
      if (gtid >= 0) {
        kmp_team_t *team = taskdata->td_team;
        kmp_info_t *thread = __kmp_get_thread();
        if (thread->th.th_team == team) {
          __kmpc_proxy_task_completed(gtid, ptask);
        } else {
          __kmpc_proxy_task_completed_ooo(ptask);
        }
      } else {
        __kmpc_proxy_task_completed_ooo(ptask);
      }
    }
  }
}
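
/* For context (illustrative, not part of the runtime): __kmp_fulfill_event is
   reached when user code fulfills the event of a detached task (OpenMP 5.0):

     omp_event_handle_t ev;
     #pragma omp task detach(ev)
     {
       start_async_work(ev); // hypothetical user routine that captures ev
     }
     // ... later, from the completion callback of the asynchronous work:
     omp_fulfill_event(ev);  // ends up calling __kmp_fulfill_event()

   The allow-completion event returned by __kmpc_task_allow_completion_event()
   above is the runtime object behind omp_event_handle_t in this scenario. */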
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// (for taskloop)
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED)
    taskdata->td_last_tied = taskdata;

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
// class to encapsulate manipulating loop bounds in a taskloop task.
// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
class kmp_taskloop_bounds_t {
  kmp_task_t *task;
  const kmp_taskdata_t *taskdata;
  size_t lower_offset;
  size_t upper_offset;

public:
  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
        lower_offset((char *)lb - (char *)task),
        upper_offset((char *)ub - (char *)task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }
  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }
  kmp_uint64 get_lb() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + lower_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
        retval = (kmp_int64)*lb;
      } else {
        kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
        retval = (kmp_int64)*lb;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  kmp_uint64 get_ub() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + upper_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      } else {
        kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + upper_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  void set_lb(kmp_uint64 lb) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + lower_offset) = lb;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
        *lower = (kmp_uint32)lb;
      } else {
        kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
        *lower = (kmp_uint64)lb;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
#endif // defined(KMP_GOMP_COMPAT)
  }
  void set_ub(kmp_uint64 ub) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + upper_offset) = ub;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
        *upper = (kmp_uint32)ub;
      } else {
        kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
        *upper = (kmp_uint64)ub;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
#endif // defined(KMP_GOMP_COMPAT)
  }
};
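
/* Illustrative note: for a native (GOMP-style) task the loop bounds live in
   the first two slots of task->shareds, stored with the ABI's sizeof(long),
   while an Intel-style task keeps them inside the task structure itself at
   lower_offset/upper_offset. Rough picture, assuming 64-bit long:

     // GOMP layout                 // Intel layout
     shareds[0] -> lower bound      *(kmp_uint64 *)((char *)task + lower_offset)
     shareds[1] -> upper bound      *(kmp_uint64 *)((char *)task + upper_offset)

   which is exactly the distinction the accessors above switch on via
   taskdata->td_flags.native and td_size_loop_bounds. */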
// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// tc         Iterations count
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
                task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}
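
/* Worked example (illustrative): for tc = 10 iterations split into
   num_tasks = 3 chunks, the scheduling code in __kmpc_taskloop computes
   grainsize = 10 / 3 = 3 and extras = 10 % 3 = 1, so the loop above generates
   chunks of 4, 3 and 3 iterations (the first `extras` chunks get
   grainsize + 1), and tc == num_tasks * grainsize + extras == 3 * 3 + 1 holds
   as asserted. */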
// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task;
  kmp_uint64 *lb;
  kmp_uint64 *ub;
  void *task_dup;
  kmp_int64 st;
  kmp_uint64 ub_glob;
  kmp_uint64 num_tasks;
  kmp_uint64 grainsize;
  kmp_uint64 extras;
  kmp_uint64 tc;
  kmp_uint64 num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra;
#endif
} __taskloop_params_t;

void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);
// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {
  __taskloop_params_t *p =
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}
// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_uint64 tc, kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
}
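
/* Worked example (illustrative): with num_tasks = 100 and num_t_min = 25 the
   recursion above hands 50 chunks to an auxiliary __kmp_taskloop_task and
   keeps 50 for itself; each 50 splits again into 25 + 25, which are at the
   threshold and are finally spawned via __kmp_taskloop_linear(). Every level
   defers half of the remaining chunks, so task creation is itself spread
   across threads instead of being a serial loop in the encountering thread. */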
/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if no taskgroup needs to be added, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  __kmp_assert_valid_gtid(gtid);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1; // too big grainsize requested, adjust values
      grainsize = tc;
      extras = 0;
    } else {
      num_tasks = tc / grainsize;
      // adjust grainsize for balanced distribution of iterations
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
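
/* Usage context (illustrative): this entry point is what a compiler emits for
   the OpenMP taskloop construct, e.g.

     #pragma omp taskloop grainsize(64)
     for (int i = 0; i < n; ++i)
       body(i);

   becomes a pattern task plus a call to __kmpc_taskloop() with sched = 1 and
   grainsize = 64; a num_tasks(K) clause maps to sched = 2 with grainsize = K,
   and no clause maps to sched = 0, the default heuristic handled above. */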