/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if (entries & TASK_STACK_INDEX_MASK == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping: move down a level in the stack
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

// __kmp_push_task_stack: Push the tied task onto the task stack.
//   Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

// __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
//   the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

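// Illustrative note (editor sketch, not part of the upstream sources): under
// the Task Scheduling Constraint the loop above walks tasknew's parent chain
// upward while the parent's td_level is still greater than current->td_level.
// For example, if `current` sits at nesting level 2 and `tasknew` at level 5,
// the walk visits the ancestors at levels 4, 3 and 2, and the candidate is
// allowed only if the walk reaches `current` itself; otherwise it is rejected.
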
// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

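// Worked example (editor sketch, not part of the upstream sources): suppose
// size = 4, the deque slots hold [a, b, c, d] and td_deque_head = 2, so the
// logical FIFO order is c, d, a, b. The copy loop above starts at i = head and
// wraps with TASK_DEQUE_MASK, producing new_deque = [c, d, a, b, -, -, -, -];
// head is then rebased to 0 and tail set to the old size (4), preserving the
// task order while the capacity grows to new_size = 8.
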
//  __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata

  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

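// Note on the ring buffer used above (editor sketch, not part of the upstream
// sources): TASK_DEQUE_SIZE is a power of two, so advancing the tail with
// "(td_deque_tail + 1) & TASK_DEQUE_MASK(td)" wraps without a division, e.g.
// with a mask of 0xff a tail of 255 becomes 0. Only the owning thread pushes
// at the tail; removal (by the owner or by stealing threads, implemented later
// in this file) happens under td_deque_lock, which is why ntasks is published
// with TCW_4 while the lock is still held.
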
// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.
static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif // OMPT_SUPPORT

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

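// Counting sketch (editor note, not part of the upstream sources):
// td_allocated_child_tasks starts at 1 for the task itself and is incremented
// once per explicit child allocated, so a task that allocated two children
// carries the value 3. The "- 1" above removes the task's own reference; the
// task is freed only when the remaining count reaches 0, and the loop then
// moves to the parent, dropping the freed child's reference from the parent's
// counter before deciding whether the parent can be freed as well.
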
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we switch back
        // the omp_fulfill_event signals completion
        // locking is necessary to avoid a race with ompt_task_late_fulfill
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE) {
      // Predecrement simulated by "- 1" calculation
      children =
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
      __kmp_release_deps(gtid, taskdata);
    } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
      // if we found proxy tasks there could exist a dependency chain
      // with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  __kmp_assert_valid_gtid(gtid);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

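// Example (editor sketch, not part of the upstream sources): assuming val is a
// power of two, __kmp_round_up_to_val(20, 8) clears the low bits (20 & ~7 ==
// 16) and then adds val, returning 24, while a size that is already a multiple
// of val, e.g. 32, is returned unchanged.
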
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_flags.detachable = flags->detachable;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable task
  if (flags->proxy == TASK_PROXY ||
      flags->detachable == TASK_DETACHABLE ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

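// Resulting layout of the single allocation made above (editor sketch, not
// part of the upstream sources):
//
//   +----------------+--------------------------------+---------+---------+
//   | kmp_taskdata_t | kmp_task_t + compiler privates | padding | shareds |
//   +----------------+--------------------------------+---------+---------+
//   ^ taskdata         ^ task = KMP_TASKDATA_TO_TASK(taskdata)   ^ task->shareds
//
// shareds_offset is sizeof(kmp_taskdata_t) + sizeof_kmp_task_t rounded up to
// sizeof(void *), so the pointers stored in the shareds block stay aligned.
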
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
  __kmp_assert_valid_gtid(gtid);
  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                input_flags->proxy ? "proxy" : "",
                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

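// Typical compiler-generated call sequence (editor sketch under assumptions,
// not emitted by this file; omp_task_entry_ stands for the outlined task
// routine, __kmpc_omp_task is the enqueue entry point defined further below in
// the runtime, and the flags value follows the kmp_tasking_flags_t encoding
// where bit 0 set means "tied"):
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
//                                         sizeof(kmp_task_t) + privates_size,
//                                         shareds_size, &omp_task_entry_);
//   /* ... fill in t->shareds and the firstprivate area ... */
//   __kmpc_omp_task(&loc, gtid, t); // defer the task (or run it immediately)
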
kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {
  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
                                kmp_task_t *new_task) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent;
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent = new_taskdata->td_parent;
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */

  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
       gtid, loc_ref, new_taskdata));

  ANNOTATE_HAPPENS_BEFORE(new_task);
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return TASK_CURRENT_NOT_QUEUED;
}
// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
//    execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  ANNOTATE_HAPPENS_BEFORE(new_task);
  return TASK_CURRENT_NOT_QUEUED;
}
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));
  __kmp_assert_valid_gtid(gtid);

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
        parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_data_t task_data = ompt_data_none;
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            parent ? &(parent->ompt_task_info.task_data) : &task_data,
            parent ? &(parent->ompt_task_info.frame) : NULL,
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
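/* Illustrative sketch (editor's addition): the kind of user code that an
   OpenMP compiler lowers into __kmpc_omp_task_alloc() followed by
   __kmpc_omp_task(). work() is a hypothetical routine; the point is that each
   iteration becomes a deferrable explicit task, which the runtime may still
   execute immediately if it cannot be pushed onto the deque.

     void work(int i); // hypothetical user routine
     void spawn_tasks(int n) {
     #pragma omp parallel
     #pragma omp single
       for (int i = 0; i < n; ++i) {
     #pragma omp task firstprivate(i)
         work(i);
       }
     }
*/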
// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, void *codeptr_ra) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
    parent = new_taskdata->td_parent;
    if (!parent->ompt_task_info.frame.enter_frame.ptr)
      parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
          codeptr_ra);
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    if (must_wait) {
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &(taskdata->td_incomplete_child_tasks)),
                       0U);
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

    ANNOTATE_HAPPENS_AFTER(taskdata);
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
                                            return_address);
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete.
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
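/* Illustrative sketch (editor's addition): __kmpc_omp_taskwait implements the
   user-level taskwait construct shown below. The generating task waits
   (possibly executing other tasks meanwhile) until its direct child tasks are
   complete; work() is a hypothetical routine.

     int work(int i); // hypothetical user routine
     int wait_for_children(void) {
       int a = 0, b = 0;
     #pragma omp task shared(a)
       a = work(1);
     #pragma omp task shared(b)
       b = work(2);
     #pragma omp taskwait // both child tasks are complete past this point
       return a + b;
     }
*/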
// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          __kmp_execute_tasks_32(
              thread, gtid, NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
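/* Illustrative sketch (editor's addition): __kmpc_omp_taskyield implements
   the taskyield construct, which is only a scheduling hint - when tasking is
   enabled, the code above honors it by executing other queued tasks at that
   point, and otherwise it is a no-op.

     #include <omp.h>
     void yield_while_contended(omp_lock_t *lk) {
       while (!omp_test_lock(lk)) {
     #pragma omp taskyield // let the runtime schedule other tasks meanwhile
       }
       // ... work under the lock ...
       omp_unset_lock(lk);
     }
*/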
// Task Reduction implementation
//
// Note: initial implementation didn't take into account the possibility
// to specify omp_orig for initializer of the UDR (user defined reduction).
// Corrected implementation takes into account the omp_orig object.
// Compiler is free to use old implementation if omp_orig is not specified.

/*!
@ingroup BASIC_TYPES
@{
*/

/*!
Flags for special info per task reduction item.
*/
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
  unsigned lazy_priv : 1;
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;

/*!
Internal struct for reduction data item related info set up by compiler.
*/
typedef struct kmp_task_red_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item in bytes */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (single parameter) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;

/*!
Internal struct for reduction data item related info saved by the library.
*/
typedef struct kmp_taskred_data {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
  void *reduce_priv; /**< array of thread specific items */
  void *reduce_pend; /**< end of private data for faster comparison op */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_comb; /**< data combiner routine */
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_orig; /**< original item (can be used in UDR initializer) */
} kmp_taskred_data_t;

/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
typedef struct kmp_taskred_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  void *reduce_orig; /**< original reduction item used for initialization */
  size_t reduce_size; /**< size of data item */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;
/*!
@}
*/
template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                             kmp_task_red_input_t &src) {
  item.reduce_orig = NULL;
}
template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                            kmp_taskred_input_t &src) {
  if (src.reduce_orig != NULL) {
    item.reduce_orig = src.reduce_orig;
  } else {
    item.reduce_orig = src.reduce_shar;
  } // non-NULL reduce_orig means new interface used
}

template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                           int offset) {
  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
}
template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                          int offset) {
  ((void (*)(void *, void *))item.reduce_init)(
      (char *)(item.reduce_priv) + offset, item.reduce_orig);
}
template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_int32 nth = thread->th.th_team_nproc;
  kmp_taskred_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  if (nth == 1) {
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    size_t size = data[i].reduce_size - 1;
    // round the size up to cache line per thread-specific item
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = data[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].flags = data[i].flags;
    arr[i].reduce_comb = data[i].reduce_comb;
    arr[i].reduce_init = data[i].reduce_init;
    arr[i].reduce_fini = data[i].reduce_fini;
    __kmp_assign_orig<T>(arr[i], data[i]);
    if (!arr[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (arr[i].reduce_init != NULL) {
        // initialize all thread-specific items
        for (int j = 0; j < nth; ++j) {
          __kmp_call_init<T>(arr[i], j * size);
        }
      }
    } else {
      // only allocate space for pointers now,
      // objects will be lazily allocated/initialized if/when requested
      // note that __kmp_allocate zeroes the allocated memory
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}
/*!
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}

/*!
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
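/* Illustrative sketch (editor's addition): __kmpc_taskred_init (and the older
   __kmpc_task_reduction_init) back the task_reduction/in_reduction clauses.
   A compiler lowers the clause into the kmp_taskred_input_t array passed to
   this entry; in user code the same thing looks like:

     long sum_list(int n, const int *v) {
       long sum = 0;
     #pragma omp taskgroup task_reduction(+ : sum)
       {
         for (int i = 0; i < n; ++i) {
     #pragma omp task in_reduction(+ : sum) firstprivate(i)
           sum += v[i];
         }
       }
       return sum;
     }
*/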
// Copy task reduction data (except for shared pointers).
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
                                    kmp_taskgroup_t *tg, void *reduce_data) {
  kmp_taskred_data_t *arr;
  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
                " from data %p\n",
                thr, tg, reduce_data));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thr, num * sizeof(kmp_taskred_data_t));
  // threads will share private copies, thunk routines, sizes, flags, etc.:
  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
}
/*!
@param gtid    Global thread ID
@param tskgrp  The taskgroup ID (optional)
@param data    Shared location of the item
@return The pointer to per-thread data

Get thread-specific location of data item
*/
void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2230 __kmp_assert_valid_gtid(gtid
);
2231 kmp_info_t
*thread
= __kmp_threads
[gtid
];
2232 kmp_int32 nth
= thread
->th
.th_team_nproc
;
2234 return data
; // nothing to do
2236 kmp_taskgroup_t
*tg
= (kmp_taskgroup_t
*)tskgrp
;
2238 tg
= thread
->th
.th_current_task
->td_taskgroup
;
2239 KMP_ASSERT(tg
!= NULL
);
2240 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)(tg
->reduce_data
);
2241 kmp_int32 num
= tg
->reduce_num_data
;
2242 kmp_int32 tid
= thread
->th
.th_info
.ds
.ds_tid
;
2244 KMP_ASSERT(data
!= NULL
);
2245 while (tg
!= NULL
) {
2246 for (int i
= 0; i
< num
; ++i
) {
2247 if (!arr
[i
].flags
.lazy_priv
) {
2248 if (data
== arr
[i
].reduce_shar
||
2249 (data
>= arr
[i
].reduce_priv
&& data
< arr
[i
].reduce_pend
))
2250 return (char *)(arr
[i
].reduce_priv
) + tid
* arr
[i
].reduce_size
;
2252 // check shared location first
2253 void **p_priv
= (void **)(arr
[i
].reduce_priv
);
2254 if (data
== arr
[i
].reduce_shar
)
2256 // check if we get some thread specific location as parameter
2257 for (int j
= 0; j
< nth
; ++j
)
2258 if (data
== p_priv
[j
])
2260 continue; // not found, continue search
2262 if (p_priv
[tid
] == NULL
) {
2263 // allocate thread specific object lazily
2264 p_priv
[tid
] = __kmp_allocate(arr
[i
].reduce_size
);
2265 if (arr
[i
].reduce_init
!= NULL
) {
2266 if (arr
[i
].reduce_orig
!= NULL
) { // new interface
2267 ((void (*)(void *, void *))arr
[i
].reduce_init
)(
2268 p_priv
[tid
], arr
[i
].reduce_orig
);
2269 } else { // old interface (single parameter)
2270 ((void (*)(void *))arr
[i
].reduce_init
)(p_priv
[tid
]);
2278 arr
= (kmp_taskred_data_t
*)(tg
->reduce_data
);
2279 num
= tg
->reduce_num_data
;
2281 KMP_ASSERT2(0, "Unknown task reduction item");
2282 return NULL
; // ERROR, this line never executed
2285 // Finalize task reduction.
2286 // Called from __kmpc_end_taskgroup()
2287 static void __kmp_task_reduction_fini(kmp_info_t
*th
, kmp_taskgroup_t
*tg
) {
2288 kmp_int32 nth
= th
->th
.th_team_nproc
;
2289 KMP_DEBUG_ASSERT(nth
> 1); // should not be called if nth == 1
2290 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)tg
->reduce_data
;
2291 kmp_int32 num
= tg
->reduce_num_data
;
2292 for (int i
= 0; i
< num
; ++i
) {
2293 void *sh_data
= arr
[i
].reduce_shar
;
2294 void (*f_fini
)(void *) = (void (*)(void *))(arr
[i
].reduce_fini
);
2295 void (*f_comb
)(void *, void *) =
2296 (void (*)(void *, void *))(arr
[i
].reduce_comb
);
2297 if (!arr
[i
].flags
.lazy_priv
) {
2298 void *pr_data
= arr
[i
].reduce_priv
;
2299 size_t size
= arr
[i
].reduce_size
;
2300 for (int j
= 0; j
< nth
; ++j
) {
2301 void *priv_data
= (char *)pr_data
+ j
* size
;
2302 f_comb(sh_data
, priv_data
); // combine results
2304 f_fini(priv_data
); // finalize if needed
2307 void **pr_data
= (void **)(arr
[i
].reduce_priv
);
2308 for (int j
= 0; j
< nth
; ++j
) {
2309 if (pr_data
[j
] != NULL
) {
2310 f_comb(sh_data
, pr_data
[j
]); // combine results
2312 f_fini(pr_data
[j
]); // finalize if needed
2313 __kmp_free(pr_data
[j
]);
2317 __kmp_free(arr
[i
].reduce_priv
);
2319 __kmp_thread_free(th
, arr
);
2320 tg
->reduce_data
= NULL
;
2321 tg
->reduce_num_data
= 0;
// Cleanup task reduction data for parallel or worksharing,
// do not touch task private data other threads still working with.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
  __kmp_thread_free(th, tg->reduce_data);
  tg->reduce_data = NULL;
  tg->reduce_num_data = 0;
}
2333 template <typename T
>
2334 void *__kmp_task_reduction_modifier_init(ident_t
*loc
, int gtid
, int is_ws
,
2336 __kmp_assert_valid_gtid(gtid
);
2337 kmp_info_t
*thr
= __kmp_threads
[gtid
];
2338 kmp_int32 nth
= thr
->th
.th_team_nproc
;
2339 __kmpc_taskgroup(loc
, gtid
); // form new taskgroup first
2342 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2343 gtid
, thr
->th
.th_current_task
->td_taskgroup
));
2344 return (void *)thr
->th
.th_current_task
->td_taskgroup
;
2346 kmp_team_t
*team
= thr
->th
.th_team
;
2348 kmp_taskgroup_t
*tg
;
2349 reduce_data
= KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_reduce_data
[is_ws
]);
2350 if (reduce_data
== NULL
&&
2351 __kmp_atomic_compare_store(&team
->t
.t_tg_reduce_data
[is_ws
], reduce_data
,
2353 // single thread enters this block to initialize common reduction data
2354 KMP_DEBUG_ASSERT(reduce_data
== NULL
);
2355 // first initialize own data, then make a copy other threads can use
2356 tg
= (kmp_taskgroup_t
*)__kmp_task_reduction_init
<T
>(gtid
, num
, data
);
2357 reduce_data
= __kmp_thread_malloc(thr
, num
* sizeof(kmp_taskred_data_t
));
2358 KMP_MEMCPY(reduce_data
, tg
->reduce_data
, num
* sizeof(kmp_taskred_data_t
));
2359 // fini counters should be 0 at this point
2360 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_fini_counter
[0]) == 0);
2361 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team
->t
.t_tg_fini_counter
[1]) == 0);
2362 KMP_ATOMIC_ST_REL(&team
->t
.t_tg_reduce_data
[is_ws
], reduce_data
);
2365 (reduce_data
= KMP_ATOMIC_LD_ACQ(&team
->t
.t_tg_reduce_data
[is_ws
])) ==
2366 (void *)1) { // wait for task reduction initialization
2369 KMP_DEBUG_ASSERT(reduce_data
> (void *)1); // should be valid pointer here
2370 tg
= thr
->th
.th_current_task
->td_taskgroup
;
2371 __kmp_task_reduction_init_copy
<T
>(thr
, num
, data
, tg
, reduce_data
);
/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                          int num, void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_task_red_input_t *)data);
}

/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
                                   void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_taskred_input_t *)data);
}

/*!
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  __kmpc_end_taskgroup(loc, gtid);
}
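/* Illustrative sketch (editor's addition): the modifier_init/fini entries
   above implement the "task" reduction modifier on parallel and worksharing
   constructs, which behaves like an implicit taskgroup with task reduction
   wrapped around the region. In user code:

     long sum_with_tasks(int n, const int *v) {
       long sum = 0;
     #pragma omp parallel reduction(task, + : sum)
       {
     #pragma omp single
         for (int i = 0; i < n; ++i) {
     #pragma omp task in_reduction(+ : sum) firstprivate(i)
           sum += v[i];
         }
       }
       return sum;
     }
*/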
// __kmpc_taskgroup: Start a new taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *tg_new =
      (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
  tg_new->parent = taskdata->td_taskgroup;
  tg_new->reduce_data = NULL;
  tg_new->reduce_num_data = 0;
  taskdata->td_taskgroup = tg_new;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    kmp_team_t *team = thread->th.th_team;
    ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;

    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}
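/* Illustrative sketch (editor's addition): __kmpc_taskgroup and
   __kmpc_end_taskgroup below bracket the user-level taskgroup region. Unlike
   taskwait, the end of a taskgroup waits for all descendant tasks created in
   the region, not just the direct children. traverse() is hypothetical.

     void traverse(void *node); // hypothetical user routine
     void process_tree(void *root) {
     #pragma omp taskgroup
       {
     #pragma omp task
         traverse(root); // may recursively create nested tasks
       } // all tasks created in the region, at any depth, are done here
     }
*/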
2462 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2463 // and its descendants are complete
2464 void __kmpc_end_taskgroup(ident_t
*loc
, int gtid
) {
2465 __kmp_assert_valid_gtid(gtid
);
2466 kmp_info_t
*thread
= __kmp_threads
[gtid
];
2467 kmp_taskdata_t
*taskdata
= thread
->th
.th_current_task
;
2468 kmp_taskgroup_t
*taskgroup
= taskdata
->td_taskgroup
;
2469 int thread_finished
= FALSE
;
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473 ompt_data_t my_task_data
;
2474 ompt_data_t my_parallel_data
;
2476 if (UNLIKELY(ompt_enabled
.enabled
)) {
2477 team
= thread
->th
.th_team
;
2478 my_task_data
= taskdata
->ompt_task_info
.task_data
;
2479 // FIXME: I think this is wrong for lwt!
2480 my_parallel_data
= team
->t
.ompt_team_info
.parallel_data
;
2481 codeptr
= OMPT_LOAD_RETURN_ADDRESS(gtid
);
2483 codeptr
= OMPT_GET_RETURN_ADDRESS(0);
2487 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid
, loc
));
2488 KMP_DEBUG_ASSERT(taskgroup
!= NULL
);
2489 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP
);
2491 if (__kmp_tasking_mode
!= tskm_immediate_exec
) {
2492 // mark task as waiting not on a barrier
2493 taskdata
->td_taskwait_counter
+= 1;
2494 taskdata
->td_taskwait_ident
= loc
;
2495 taskdata
->td_taskwait_thread
= gtid
+ 1;
2497 // For ITT the taskgroup wait is similar to taskwait until we need to
2499 void *itt_sync_obj
= __kmp_itt_taskwait_object(gtid
);
2500 if (itt_sync_obj
!= NULL
)
2501 __kmp_itt_taskwait_starting(gtid
, itt_sync_obj
);
2502 #endif /* USE_ITT_BUILD */
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
2505 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region_wait
)) {
2506 ompt_callbacks
.ompt_callback(ompt_callback_sync_region_wait
)(
2507 ompt_sync_region_taskgroup
, ompt_scope_begin
, &(my_parallel_data
),
2508 &(my_task_data
), codeptr
);
2512 if (!taskdata
->td_flags
.team_serial
||
2513 (thread
->th
.th_task_team
!= NULL
&&
2514 thread
->th
.th_task_team
->tt
.tt_found_proxy_tasks
)) {
2515 kmp_flag_32
flag(RCAST(std::atomic
<kmp_uint32
> *, &(taskgroup
->count
)),
2517 while (KMP_ATOMIC_LD_ACQ(&taskgroup
->count
) != 0) {
2518 flag
.execute_tasks(thread
, gtid
, FALSE
,
2519 &thread_finished
USE_ITT_BUILD_ARG(itt_sync_obj
),
2520 __kmp_task_stealing_constraint
);
2523 taskdata
->td_taskwait_thread
= -taskdata
->td_taskwait_thread
; // end waiting
2525 #if OMPT_SUPPORT && OMPT_OPTIONAL
2526 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region_wait
)) {
2527 ompt_callbacks
.ompt_callback(ompt_callback_sync_region_wait
)(
2528 ompt_sync_region_taskgroup
, ompt_scope_end
, &(my_parallel_data
),
2529 &(my_task_data
), codeptr
);
2534 if (itt_sync_obj
!= NULL
)
2535 __kmp_itt_taskwait_finished(gtid
, itt_sync_obj
);
2536 KMP_FSYNC_ACQUIRED(taskdata
); // acquire self - sync with descendants
2537 #endif /* USE_ITT_BUILD */
2539 KMP_DEBUG_ASSERT(taskgroup
->count
== 0);
2541 if (taskgroup
->reduce_data
!= NULL
) { // need to reduce?
2544 kmp_team_t
*t
= thread
->th
.th_team
;
2545 kmp_taskred_data_t
*arr
= (kmp_taskred_data_t
*)taskgroup
->reduce_data
;
2546 // check if <priv> data of the first reduction variable shared for the team
2547 void *priv0
= arr
[0].reduce_priv
;
2548 if ((reduce_data
= KMP_ATOMIC_LD_ACQ(&t
->t
.t_tg_reduce_data
[0])) != NULL
&&
2549 ((kmp_taskred_data_t
*)reduce_data
)[0].reduce_priv
== priv0
) {
2550 // finishing task reduction on parallel
2551 cnt
= KMP_ATOMIC_INC(&t
->t
.t_tg_fini_counter
[0]);
2552 if (cnt
== thread
->th
.th_team_nproc
- 1) {
2553 // we are the last thread passing __kmpc_reduction_modifier_fini()
2554 // finalize task reduction:
2555 __kmp_task_reduction_fini(thread
, taskgroup
);
2556 // cleanup fields in the team structure:
2557 // TODO: is relaxed store enough here (whole barrier should follow)?
2558 __kmp_thread_free(thread
, reduce_data
);
2559 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_reduce_data
[0], NULL
);
2560 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_fini_counter
[0], 0);
2562 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2563 // so do not finalize reduction, just clean own copy of the data
2564 __kmp_task_reduction_clean(thread
, taskgroup
);
2566 } else if ((reduce_data
= KMP_ATOMIC_LD_ACQ(&t
->t
.t_tg_reduce_data
[1])) !=
2568 ((kmp_taskred_data_t
*)reduce_data
)[0].reduce_priv
== priv0
) {
2569 // finishing task reduction on worksharing
2570 cnt
= KMP_ATOMIC_INC(&t
->t
.t_tg_fini_counter
[1]);
2571 if (cnt
== thread
->th
.th_team_nproc
- 1) {
2572 // we are the last thread passing __kmpc_reduction_modifier_fini()
2573 __kmp_task_reduction_fini(thread
, taskgroup
);
2574 // cleanup fields in team structure:
2575 // TODO: is relaxed store enough here (whole barrier should follow)?
2576 __kmp_thread_free(thread
, reduce_data
);
2577 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_reduce_data
[1], NULL
);
2578 KMP_ATOMIC_ST_REL(&t
->t
.t_tg_fini_counter
[1], 0);
2580 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2581 // so do not finalize reduction, just clean own copy of the data
2582 __kmp_task_reduction_clean(thread
, taskgroup
);
2585 // finishing task reduction on taskgroup
2586 __kmp_task_reduction_fini(thread
, taskgroup
);
2589 // Restore parent taskgroup for the current task
2590 taskdata
->td_taskgroup
= taskgroup
->parent
;
2591 __kmp_thread_free(thread
, taskgroup
);
2593 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2595 ANNOTATE_HAPPENS_AFTER(taskdata
);
2597 #if OMPT_SUPPORT && OMPT_OPTIONAL
2598 if (UNLIKELY(ompt_enabled
.ompt_callback_sync_region
)) {
2599 ompt_callbacks
.ompt_callback(ompt_callback_sync_region
)(
2600 ompt_sync_region_taskgroup
, ompt_scope_end
, &(my_parallel_data
),
2601 &(my_task_data
), codeptr
);
2606 // __kmp_remove_my_task: remove a task from my own deque
2607 static kmp_task_t
*__kmp_remove_my_task(kmp_info_t
*thread
, kmp_int32 gtid
,
2608 kmp_task_team_t
*task_team
,
2609 kmp_int32 is_constrained
) {
2611 kmp_taskdata_t
*taskdata
;
2612 kmp_thread_data_t
*thread_data
;
2615 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2616 KMP_DEBUG_ASSERT(task_team
->tt
.tt_threads_data
!=
2617 NULL
); // Caller should check this condition
2619 thread_data
= &task_team
->tt
.tt_threads_data
[__kmp_tid_from_gtid(gtid
)];
2621 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2622 gtid
, thread_data
->td
.td_deque_ntasks
,
2623 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2625 if (TCR_4(thread_data
->td
.td_deque_ntasks
) == 0) {
2627 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2628 "ntasks=%d head=%u tail=%u\n",
2629 gtid
, thread_data
->td
.td_deque_ntasks
,
2630 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2634 __kmp_acquire_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2636 if (TCR_4(thread_data
->td
.td_deque_ntasks
) == 0) {
2637 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2639 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2640 "ntasks=%d head=%u tail=%u\n",
2641 gtid
, thread_data
->td
.td_deque_ntasks
,
2642 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2646 tail
= (thread_data
->td
.td_deque_tail
- 1) &
2647 TASK_DEQUE_MASK(thread_data
->td
); // Wrap index.
2648 taskdata
= thread_data
->td
.td_deque
[tail
];
2650 if (!__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
,
2651 thread
->th
.th_current_task
)) {
2652 // The TSC does not allow to steal victim task
2653 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2655 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2656 "ntasks=%d head=%u tail=%u\n",
2657 gtid
, thread_data
->td
.td_deque_ntasks
,
2658 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2662 thread_data
->td
.td_deque_tail
= tail
;
2663 TCW_4(thread_data
->td
.td_deque_ntasks
, thread_data
->td
.td_deque_ntasks
- 1);
2665 __kmp_release_bootstrap_lock(&thread_data
->td
.td_deque_lock
);
2667 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2668 "ntasks=%d head=%u tail=%u\n",
2669 gtid
, taskdata
, thread_data
->td
.td_deque_ntasks
,
2670 thread_data
->td
.td_deque_head
, thread_data
->td
.td_deque_tail
));
2672 task
= KMP_TASKDATA_TO_TASK(taskdata
);
2676 // __kmp_steal_task: remove a task from another thread's deque
2677 // Assume that calling thread has already checked existence of
2678 // task_team thread_data before calling this routine.
2679 static kmp_task_t
*__kmp_steal_task(kmp_info_t
*victim_thr
, kmp_int32 gtid
,
2680 kmp_task_team_t
*task_team
,
2681 std::atomic
<kmp_int32
> *unfinished_threads
,
2682 int *thread_finished
,
2683 kmp_int32 is_constrained
) {
2685 kmp_taskdata_t
*taskdata
;
2686 kmp_taskdata_t
*current
;
2687 kmp_thread_data_t
*victim_td
, *threads_data
;
2689 kmp_int32 victim_tid
;
2691 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2693 threads_data
= task_team
->tt
.tt_threads_data
;
2694 KMP_DEBUG_ASSERT(threads_data
!= NULL
); // Caller should check this condition
2696 victim_tid
= victim_thr
->th
.th_info
.ds
.ds_tid
;
2697 victim_td
= &threads_data
[victim_tid
];
2699 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2700 "task_team=%p ntasks=%d head=%u tail=%u\n",
2701 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2702 victim_td
->td
.td_deque_ntasks
, victim_td
->td
.td_deque_head
,
2703 victim_td
->td
.td_deque_tail
));
2705 if (TCR_4(victim_td
->td
.td_deque_ntasks
) == 0) {
2706 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2707 "task_team=%p ntasks=%d head=%u tail=%u\n",
2708 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2709 victim_td
->td
.td_deque_ntasks
, victim_td
->td
.td_deque_head
,
2710 victim_td
->td
.td_deque_tail
));
2714 __kmp_acquire_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2716 int ntasks
= TCR_4(victim_td
->td
.td_deque_ntasks
);
2717 // Check again after we acquire the lock
2719 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2720 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2721 "task_team=%p ntasks=%d head=%u tail=%u\n",
2722 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2723 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2727 KMP_DEBUG_ASSERT(victim_td
->td
.td_deque
!= NULL
);
2728 current
= __kmp_threads
[gtid
]->th
.th_current_task
;
2729 taskdata
= victim_td
->td
.td_deque
[victim_td
->td
.td_deque_head
];
2730 if (__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
, current
)) {
2731 // Bump head pointer and Wrap.
2732 victim_td
->td
.td_deque_head
=
2733 (victim_td
->td
.td_deque_head
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2735 if (!task_team
->tt
.tt_untied_task_encountered
) {
2736 // The TSC does not allow to steal victim task
2737 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2738 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2739 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2740 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2741 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2745 // walk through victim's deque trying to steal any task
2746 target
= victim_td
->td
.td_deque_head
;
2748 for (i
= 1; i
< ntasks
; ++i
) {
2749 target
= (target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2750 taskdata
= victim_td
->td
.td_deque
[target
];
2751 if (__kmp_task_is_allowed(gtid
, is_constrained
, taskdata
, current
)) {
2752 break; // found victim task
2757 if (taskdata
== NULL
) {
2758 // No appropriate candidate to steal found
2759 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2760 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2761 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2762 gtid
, __kmp_gtid_from_thread(victim_thr
), task_team
, ntasks
,
2763 victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2767 for (i
= i
+ 1; i
< ntasks
; ++i
) {
2768 // shift remaining tasks in the deque left by 1
2769 target
= (target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
);
2770 victim_td
->td
.td_deque
[prev
] = victim_td
->td
.td_deque
[target
];
2774 victim_td
->td
.td_deque_tail
==
2775 (kmp_uint32
)((target
+ 1) & TASK_DEQUE_MASK(victim_td
->td
)));
2776 victim_td
->td
.td_deque_tail
= target
; // tail -= 1 (wrapped))
2778 if (*thread_finished
) {
2779 // We need to un-mark this victim as a finished victim. This must be done
2780 // before releasing the lock, or else other threads (starting with the
2781 // master victim) might be prematurely released from the barrier!!!
2784 count
= KMP_ATOMIC_INC(unfinished_threads
);
2788 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2789 gtid
, count
+ 1, task_team
));
2791 *thread_finished
= FALSE
;
2793 TCW_4(victim_td
->td
.td_deque_ntasks
, ntasks
- 1);
2795 __kmp_release_bootstrap_lock(&victim_td
->td
.td_deque_lock
);
2797 KMP_COUNT_BLOCK(TASK_stolen
);
2799 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2800 "task_team=%p ntasks=%d head=%u tail=%u\n",
2801 gtid
, taskdata
, __kmp_gtid_from_thread(victim_thr
), task_team
,
2802 ntasks
, victim_td
->td
.td_deque_head
, victim_td
->td
.td_deque_tail
));
2804 task
= KMP_TASKDATA_TO_TASK(taskdata
);
// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
2818 static inline int __kmp_execute_tasks_template(
2819 kmp_info_t
*thread
, kmp_int32 gtid
, C
*flag
, int final_spin
,
2820 int *thread_finished
USE_ITT_BUILD_ARG(void *itt_sync_obj
),
2821 kmp_int32 is_constrained
) {
2822 kmp_task_team_t
*task_team
= thread
->th
.th_task_team
;
2823 kmp_thread_data_t
*threads_data
;
2825 kmp_info_t
*other_thread
;
2826 kmp_taskdata_t
*current_task
= thread
->th
.th_current_task
;
2827 std::atomic
<kmp_int32
> *unfinished_threads
;
2828 kmp_int32 nthreads
, victim_tid
= -2, use_own_tasks
= 1, new_victim
= 0,
2829 tid
= thread
->th
.th_info
.ds
.ds_tid
;
2831 KMP_DEBUG_ASSERT(__kmp_tasking_mode
!= tskm_immediate_exec
);
2832 KMP_DEBUG_ASSERT(thread
== __kmp_threads
[gtid
]);
2834 if (task_team
== NULL
|| current_task
== NULL
)
2837 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2838 "*thread_finished=%d\n",
2839 gtid
, final_spin
, *thread_finished
));
2841 thread
->th
.th_reap_state
= KMP_NOT_SAFE_TO_REAP
;
2842 threads_data
= (kmp_thread_data_t
*)TCR_PTR(task_team
->tt
.tt_threads_data
);
2843 KMP_DEBUG_ASSERT(threads_data
!= NULL
);
2845 nthreads
= task_team
->tt
.tt_nproc
;
2846 unfinished_threads
= &(task_team
->tt
.tt_unfinished_threads
);
2847 KMP_DEBUG_ASSERT(nthreads
> 1 || task_team
->tt
.tt_found_proxy_tasks
);
2848 KMP_DEBUG_ASSERT(*unfinished_threads
>= 0);
2850 while (1) { // Outer loop keeps trying to find tasks in case of single thread
2851 // getting tasks from target constructs
2852 while (1) { // Inner loop to find a task and execute it
2854 if (use_own_tasks
) { // check on own queue first
2855 task
= __kmp_remove_my_task(thread
, gtid
, task_team
, is_constrained
);
2857 if ((task
== NULL
) && (nthreads
> 1)) { // Steal a task
2860 // Try to steal from the last place I stole from successfully.
2861 if (victim_tid
== -2) { // haven't stolen anything yet
2862 victim_tid
= threads_data
[tid
].td
.td_deque_last_stolen
;
2864 -1) // if we have a last stolen from victim, get the thread
2865 other_thread
= threads_data
[victim_tid
].td
.td_thr
;
2867 if (victim_tid
!= -1) { // found last victim
2869 } else if (!new_victim
) { // no recent steals and we haven't already
2870 // used a new victim; select a random thread
2871 do { // Find a different thread to steal work from.
2872 // Pick a random thread. Initial plan was to cycle through all the
2873 // threads, and only return if we tried to steal from every thread,
2874 // and failed. Arch says that's not such a great idea.
2875 victim_tid
= __kmp_get_random(thread
) % (nthreads
- 1);
2876 if (victim_tid
>= tid
) {
2877 ++victim_tid
; // Adjusts random distribution to exclude self
2879 // Found a potential victim
2880 other_thread
= threads_data
[victim_tid
].td
.td_thr
;
2881 // There is a slight chance that __kmp_enable_tasking() did not wake
2882 // up all threads waiting at the barrier. If victim is sleeping,
2883 // then wake it up. Since we were going to pay the cache miss
2884 // penalty for referencing another thread's kmp_info_t struct
2886 // the check shouldn't cost too much performance at this point. In
2887 // extra barrier mode, tasks do not sleep at the separate tasking
2888 // barrier, so this isn't a problem.
2890 if ((__kmp_tasking_mode
== tskm_task_teams
) &&
2891 (__kmp_dflt_blocktime
!= KMP_MAX_BLOCKTIME
) &&
2892 (TCR_PTR(CCAST(void *, other_thread
->th
.th_sleep_loc
)) !=
2895 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread
),
2896 other_thread
->th
.th_sleep_loc
);
2897 // A sleeping thread should not have any tasks on it's queue.
2898 // There is a slight possibility that it resumes, steals a task
2899 // from another thread, which spawns more tasks, all in the time
2900 // that it takes this thread to check => don't write an assertion
2901 // that the victim's queue is empty. Try stealing from a
2902 // different thread.
2908 // We have a victim to try to steal from
2909 task
= __kmp_steal_task(other_thread
, gtid
, task_team
,
2910 unfinished_threads
, thread_finished
,
2913 if (task
!= NULL
) { // set last stolen to victim
2914 if (threads_data
[tid
].td
.td_deque_last_stolen
!= victim_tid
) {
2915 threads_data
[tid
].td
.td_deque_last_stolen
= victim_tid
;
2916 // The pre-refactored code did not try more than 1 successful new
2917 // vicitm, unless the last one generated more local tasks;
2918 // new_victim keeps track of this
2921 } else { // No tasks found; unset last_stolen
2922 KMP_CHECK_UPDATE(threads_data
[tid
].td
.td_deque_last_stolen
, -1);
2923 victim_tid
= -2; // no successful victim found
2927 if (task
== NULL
) // break out of tasking loop
2930 // Found a task; execute it
2931 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2932 if (__itt_sync_create_ptr
|| KMP_ITT_DEBUG
) {
2933 if (itt_sync_obj
== NULL
) { // we are at fork barrier where we could not
2934 // get the object reliably
2935 itt_sync_obj
= __kmp_itt_barrier_object(gtid
, bs_forkjoin_barrier
);
2937 __kmp_itt_task_starting(itt_sync_obj
);
2939 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2940 __kmp_invoke_task(gtid
, task
, current_task
);
2942 if (itt_sync_obj
!= NULL
)
2943 __kmp_itt_task_finished(itt_sync_obj
);
2944 #endif /* USE_ITT_BUILD */
2945 // If this thread is only partway through the barrier and the condition is
2946 // met, then return now, so that the barrier gather/release pattern can
2947 // proceed. If this thread is in the last spin loop in the barrier,
2948 // waiting to be released, we know that the termination condition will not
2949 // be satisfied, so don't waste any cycles checking it.
2950 if (flag
== NULL
|| (!final_spin
&& flag
->done_check())) {
2953 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2957 if (thread
->th
.th_task_team
== NULL
) {
2960 KMP_YIELD(__kmp_library
== library_throughput
); // Yield before next task
2961 // If execution of a stolen task results in more tasks being placed on our
2962 // run queue, reset use_own_tasks
2963 if (!use_own_tasks
&& TCR_4(threads_data
[tid
].td
.td_deque_ntasks
) != 0) {
2964 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2965 "other tasks, restart\n",
2972 // The task source has been exhausted. If in final spin loop of barrier,
2973 // check if termination condition is satisfied. The work queue may be empty
2974 // but there might be proxy tasks still executing.
2976 KMP_ATOMIC_LD_ACQ(¤t_task
->td_incomplete_child_tasks
) == 0) {
2977 // First, decrement the #unfinished threads, if that has not already been
2978 // done. This decrement might be to the spin location, and result in the
2979 // termination condition being satisfied.
2980 if (!*thread_finished
) {
2983 count
= KMP_ATOMIC_DEC(unfinished_threads
) - 1;
2984 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2985 "unfinished_threads to %d task_team=%p\n",
2986 gtid
, count
, task_team
));
2987 *thread_finished
= TRUE
;
2990 // It is now unsafe to reference thread->th.th_team !!!
2991 // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2992 // thread to pass through the barrier, where it might reset each thread's
2993 // th.th_team field for the next parallel region. If we can steal more
2994 // work, we know that this has not happened yet.
2995 if (flag
!= NULL
&& flag
->done_check()) {
2998 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3004 // If this thread's task team is NULL, master has recognized that there are
3005 // no more tasks; bail out
3006 if (thread
->th
.th_task_team
== NULL
) {
3008 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid
));
3012 // We could be getting tasks from target constructs; if this is the only
3013 // thread, keep trying to execute tasks from own queue
3018 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid
));
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue; // no need to wake up the encountering thread
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // to see if other threads are sleeping (using the same random mechanism
      // that is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}
/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code, free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to each
 * of the other threads in the team, so that it can steal work from them.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
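
/* Illustrative sketch (not part of the build): the free list declared above is
   a simple intrusive, singly-linked LIFO protected by __kmp_task_team_lock.
   Conceptually (the helper names below are only for illustration):

     static void push_free_task_team(kmp_task_team_t *task_team) {
       __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
       task_team->tt.tt_next = __kmp_free_task_teams; // link at the head
       __kmp_free_task_teams = task_team;
       __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
     }

     static kmp_task_team_t *pop_free_task_team(void) {
       kmp_task_team_t *task_team;
       __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
       task_team = __kmp_free_task_teams;
       if (task_team != NULL)
         __kmp_free_task_teams = task_team->tt.tt_next; // unlink the head
       __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
       return task_team;
     }

   __kmp_allocate_task_team() and __kmp_free_task_team() below implement this
   pattern, with TCR_PTR/TCW_PTR wrappers on the shared pointer. */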
// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the necessary
// data structures relating to the deque. This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}
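
/* Illustrative sketch (not part of the build): the deque allocated above is a
   power-of-two ring buffer, so the head/tail arithmetic used elsewhere in this
   file reduces to a bitwise AND with the mask (TASK_DEQUE_MASK). Roughly:

     // push at the tail (deque lock held)
     deque[tail] = taskdata;
     tail = (tail + 1) & (deque_size - 1);
     ntasks = ntasks + 1;

     // pop from the head, e.g. when stealing (deque lock held)
     taskdata = deque[head];
     head = (head + 1) & (deque_size - 1);
     ntasks = ntasks - 1;

   The real push/steal paths add the locking, the "last stolen" hint, and the
   grow-on-full logic of __kmp_realloc_task_deque(). */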
// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque != NULL) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL;
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  }

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}
// __kmp_realloc_task_threads_data:
// Allocates a threads_data array for a task team, either by allocating an
// initial array or enlarging an existing array. Only the first thread to get
// the lock allocs or enlarges the array and re-initializes the array elements.
// That thread returns "TRUE", the rest return "FALSE".
// Assumes that the new array size is given by task_team -> tt.tt_nproc.
// The current size is given by task_team -> tt.tt_max_threads.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {
  kmp_thread_data_t **threads_data_p;
  kmp_int32 nthreads, maxthreads;
  int is_init_thread = FALSE;

  if (TCR_4(task_team->tt.tt_found_tasks)) {
    // Already reallocated and initialized.
    return FALSE;
  }

  threads_data_p = &task_team->tt.tt_threads_data;
  nthreads = task_team->tt.tt_nproc;
  maxthreads = task_team->tt.tt_max_threads;

  // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // they are used.
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  if (!TCR_4(task_team->tt.tt_found_tasks)) {
    // first thread to enable tasking
    kmp_team_t *team = thread->th.th_team;
    int i;

    is_init_thread = TRUE;
    if (maxthreads < nthreads) {

      if (*threads_data_p != NULL) {
        kmp_thread_data_t *old_data = *threads_data_p;
        kmp_thread_data_t *new_data = NULL;

        KE_TRACE(
            10,
            ("__kmp_realloc_task_threads_data: T#%d reallocating "
             "threads data for task_team %p, new_size = %d, old_size = %d\n",
             __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
        // Reallocate threads_data to have more elements than current array
        // Cannot use __kmp_thread_realloc() because threads not around for
        // kmp_reap_task_team( ). Note all new array entries are initialized
        // to zero by __kmp_allocate().
        new_data = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
        // copy old data to new data
        KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
                     (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));

#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        for (i = maxthreads; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
        // Install the new data and free the old data
        (*threads_data_p) = new_data;
        __kmp_free(old_data);
      } else {
        KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
                      "threads data for task_team %p, size = %d\n",
                      __kmp_gtid_from_thread(thread), task_team, nthreads));
        // Make the initial allocate for threads_data array, and zero entries
        // Cannot use __kmp_thread_calloc() because threads not around for
        // kmp_reap_task_team( ).
        ANNOTATE_IGNORE_WRITES_BEGIN();
        *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
        ANNOTATE_IGNORE_WRITES_END();
#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        for (i = 0; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
      }
      task_team->tt.tt_max_threads = nthreads;
    } else {
      // If array has (more than) enough elements, go ahead and use it
      KMP_DEBUG_ASSERT(*threads_data_p != NULL);
    }

    // initialize threads_data pointers back to thread_info structures
    for (i = 0; i < nthreads; i++) {
      kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
      thread_data->td.td_thr = team->t.t_threads[i];

      if (thread_data->td.td_deque_last_stolen >= nthreads) {
        // The last stolen field survives across teams / barrier, and the number
        // of threads may have changed. It's possible (likely?) that a new
        // parallel region will exhibit the same behavior as previous region.
        thread_data->td.td_deque_last_stolen = -1;
      }
    }

    TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
  return is_init_thread;
}
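
/* Illustrative note (sketch, not part of the build): the routine above is a
   check / lock / re-check pattern, so only the first thread pays for the
   (re)allocation and later threads return quickly:

     if (tt_found_tasks)          // cheap unsynchronized read
       return FALSE;
     acquire(tt_threads_lock);
     if (!tt_found_tasks) {       // re-check under the lock
       ... allocate or grow threads_data, initialize entries ...
       TCW_SYNC_4(tt_found_tasks, TRUE); // publish after initialization
     }
     release(tt_threads_lock);
     return is_init_thread;

   The publishing store is synchronized (TCW_SYNC_4) so threads that take the
   fast path and skip the lock observe a fully initialized array. */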
// __kmp_free_task_threads_data:
// Deallocates a threads_data array for a task team, including any attached
// tasking deques. Only occurs at library shutdown.
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
  if (task_team->tt.tt_threads_data != NULL) {
    int i;
    for (i = 0; i < task_team->tt.tt_max_threads; i++) {
      __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
    }
    __kmp_free(task_team->tt.tt_threads_data);
    task_team->tt.tt_threads_data = NULL;
  }
  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
}
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
// structures.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;
  int nthreads;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  __kmp_gtid_from_thread(thread), team));
    // Allocate a new task team if one is not available. Cannot use
    // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
    // suppress race conditions detection on synchronization flags in debug mode
    // this helps to analyze library internals eliminating false positives
    __itt_suppress_mark_range(
        __itt_suppress_range, __itt_suppress_threading_errors,
        &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
    __itt_suppress_mark_range(__itt_suppress_range,
                              __itt_suppress_threading_errors,
                              CCAST(kmp_uint32 *, &task_team->tt.tt_active),
                              sizeof(task_team->tt.tt_active));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
    // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
    // task_team->tt.tt_threads_data = NULL;
    // task_team->tt.tt_max_threads = 0;
    // task_team->tt.tt_next = NULL;
  }

  TCW_4(task_team->tt.tt_found_tasks, FALSE);
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
  TCW_4(task_team->tt.tt_active, TRUE);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
  return task_team;
}
// __kmp_free_task_team:
// Frees the task team associated with a specific thread, and adds it
// to the global task team free list.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));

  // Put task team back on free list
  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);

  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
  task_team->tt.tt_next = __kmp_free_task_teams;
  TCW_PTR(__kmp_free_task_teams, task_team);

  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
}
// __kmp_reap_task_teams:
// Free all the task teams on the task team free list.
// Should only be done during library shutdown.
// Cannot do anything that needs a thread structure or gtid since they are
// already gone.
void __kmp_reap_task_teams(void) {
  kmp_task_team_t *task_team;

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Free all task_teams on the free list
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    while ((task_team = __kmp_free_task_teams) != NULL) {
      __kmp_free_task_teams = task_team->tt.tt_next;
      task_team->tt.tt_next = NULL;

      // Free threads_data if necessary
      if (task_team->tt.tt_threads_data != NULL) {
        __kmp_free_task_threads_data(task_team);
      }
      __kmp_free(task_team);
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }
}
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If oversubscribed or have waited a bit, yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
}
// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct (above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                          team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}
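
/* Illustrative sketch (not part of the build): each kmp_team_t carries two
   task-team slots, t_task_team[0] and t_task_team[1], and each thread's
   th_task_state selects the slot it currently uses. Per barrier, roughly:

     // master, in __kmp_task_team_setup(): prepare the *other* slot
     team->t.t_task_team[1 - state] = allocated_or_recycled_task_team;

     // every thread, in __kmp_task_team_sync(), after the release phase:
     state = 1 - state;                                   // flip parity
     this_thr->th.th_task_team = team->t.t_task_team[state];

   This double buffering lets late threads keep draining the old task team
   while the master has already set up the one for the next region. */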
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
}
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &task_team->tt.tt_unfinished_threads),
                       0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}
// __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE);
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}
// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit;

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}
/* The finish of the proxy tasks is divided in two pieces:
   - the top half is the one that can be done from a thread outside the team
   - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
   - things that can be run before queuing the bottom half
   - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
   halves. */
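
/* Rough ordering sketch (illustrative, not part of the build):

     __kmp_first_top_half_finish_proxy(td);  // mark complete; bump the proxy's
                                             // own child count as a keep-alive
     __kmp_give_task(...);                   // queue the bottom half into a
                                             // thread of the team
     __kmp_second_top_half_finish_proxy(td); // decrement the parent's counter,
                                             // then drop the keep-alive

   The bottom half spins until the keep-alive count reaches zero, so it cannot
   free the task while the second top half is still referencing it. */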
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child
  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));
  __kmp_assert_valid_gtid(gtid);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}
/*!
@ingroup TASKING
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that could not belong to
the team.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = 0;
  kmp_int32 pass = 1;
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    thread = team->t.t_threads[k];
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

  } while (!__kmp_give_task(thread, k, ptask, pass));

  __kmp_second_top_half_finish_proxy(taskdata);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       taskdata));
}
kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
                                                kmp_task_t *task) {
  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
    td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
    td->td_allow_completion_event.ed.task = task;
    __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
  }
  return &td->td_allow_completion_event;
}
void __kmp_fulfill_event(kmp_event_t *event) {
  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
    kmp_task_t *ptask = event->ed.task;
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
    bool detached = false;
    int gtid = __kmp_get_gtid();

    // The associated task might have completed or could be completing at this
    // point.
    // We need to take the lock to avoid races
    __kmp_acquire_tas_lock(&event->lock, gtid);
    if (taskdata->td_flags.proxy == TASK_PROXY) {
      detached = true;
    } else {
#if OMPT_SUPPORT
      // The OMPT event must occur under mutual exclusion,
      // otherwise the tool might access ptask after free
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
#endif
    }
    event->type = KMP_EVENT_UNINITIALIZED;
    __kmp_release_tas_lock(&event->lock, gtid);

    if (detached) {
#if OMPT_SUPPORT
      // We free ptask afterwards and know the task is finished,
      // so locking is not necessary
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
#endif
      // If the task detached complete the proxy task
      if (gtid >= 0) {
        kmp_team_t *team = taskdata->td_team;
        kmp_info_t *thread = __kmp_get_thread();
        if (thread->th.th_team == team) {
          __kmpc_proxy_task_completed(gtid, ptask);
        } else {
          __kmpc_proxy_task_completed_ooo(ptask);
        }
      } else {
        __kmpc_proxy_task_completed_ooo(ptask);
      }
    }
  }
}
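
/* For context (illustrative, not part of the runtime): __kmp_fulfill_event is
   reached when user code fulfills the event of a detached task (OpenMP 5.0):

     omp_event_handle_t ev;
     #pragma omp task detach(ev)
     {
       start_async_work(ev); // hypothetical user routine that captures ev
     }
     // ... later, from the completion callback of the asynchronous work:
     omp_fulfill_event(ev);  // ends up calling __kmp_fulfill_event()

   The allow-completion event returned by __kmpc_task_allow_completion_event()
   above is the runtime object behind omp_event_handle_t in this scenario. */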
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// (for taskloop)
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED)
    taskdata->td_last_tied = taskdata;

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
// class to encapsulate manipulating loop bounds in a taskloop task.
// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
class kmp_taskloop_bounds_t {
  kmp_task_t *task;
  const kmp_taskdata_t *taskdata;
  size_t lower_offset;
  size_t upper_offset;

public:
  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
        lower_offset((char *)lb - (char *)task),
        upper_offset((char *)ub - (char *)task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }
  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }
  kmp_uint64 get_lb() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + lower_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
        retval = (kmp_int64)*lb;
      } else {
        kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
        retval = (kmp_int64)*lb;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  kmp_uint64 get_ub() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + upper_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      } else {
        kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + upper_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  void set_lb(kmp_uint64 lb) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + lower_offset) = lb;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
        *lower = (kmp_uint32)lb;
      } else {
        kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
        *lower = (kmp_uint64)lb;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
#endif // defined(KMP_GOMP_COMPAT)
  }
  void set_ub(kmp_uint64 ub) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + upper_offset) = ub;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
        *upper = (kmp_uint32)ub;
      } else {
        kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
        *upper = (kmp_uint64)ub;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
#endif // defined(KMP_GOMP_COMPAT)
  }
};
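
/* Illustrative note: for a native (GOMP-style) task the loop bounds live in
   the first two slots of task->shareds, stored with the ABI's sizeof(long),
   while an Intel-style task keeps them inside the task structure itself at
   lower_offset/upper_offset. Rough picture, assuming 64-bit long:

     // GOMP layout                 // Intel layout
     shareds[0] -> lower bound      *(kmp_uint64 *)((char *)task + lower_offset)
     shareds[1] -> upper bound      *(kmp_uint64 *)((char *)task + upper_offset)

   which is exactly the distinction the accessors above switch on via
   taskdata->td_flags.native and td_size_loop_bounds. */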
// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// tc         Iterations count
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
                task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}
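
/* Worked example (illustrative): for tc = 10 iterations split into
   num_tasks = 3 chunks, the scheduling code in __kmpc_taskloop computes
   grainsize = 10 / 3 = 3 and extras = 10 % 3 = 1, so the loop above generates
   chunks of 4, 3 and 3 iterations (the first `extras` chunks get
   grainsize + 1), and tc == num_tasks * grainsize + extras == 3 * 3 + 1 holds
   as asserted. */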
// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task;
  kmp_uint64 *lb;
  kmp_uint64 *ub;
  void *task_dup;
  kmp_int64 st;
  kmp_uint64 ub_glob;
  kmp_uint64 num_tasks;
  kmp_uint64 grainsize;
  kmp_uint64 extras;
  kmp_uint64 tc;
  kmp_uint64 num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra;
#endif
} __taskloop_params_t;

void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);
// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {
  __taskloop_params_t *p =
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}
// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_uint64 tc, kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
}
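
/* Worked example (illustrative): with num_tasks = 100 and num_t_min = 25 the
   recursion above hands 50 chunks to an auxiliary __kmp_taskloop_task and
   keeps 50 for itself; each 50 splits again into 25 + 25, which are at the
   threshold and are finally spawned via __kmp_taskloop_linear(). Every level
   defers half of the remaining chunks, so task creation is itself spread
   across threads instead of being a serial loop in the encountering thread. */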
/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if no taskgroup needs to be added, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  __kmp_assert_valid_gtid(gtid);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1; // too big grainsize requested, adjust values
      grainsize = tc;
      extras = 0;
    } else {
      num_tasks = tc / grainsize;
      // adjust grainsize for balanced distribution of iterations
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
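
/* Usage context (illustrative): this entry point is what a compiler emits for
   the OpenMP taskloop construct, e.g.

     #pragma omp taskloop grainsize(64)
     for (int i = 0; i < n; ++i)
       body(i);

   becomes a pattern task plus a call to __kmpc_taskloop() with sched = 1 and
   grainsize = 64; a num_tasks(K) clause maps to sched = 2 with grainsize = K,
   and no clause maps to sched = 0, the default heuristic handled above. */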