libgomp/iter.c

   1 /* Copyright (C) 2005-2024 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Offloading and Multi Processing Library
   5    (libgomp).
   6
   7    Libgomp is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15    more details.
  16
  17    Under Section 7 of GPL version 3, you are granted additional
  18    permissions described in the GCC Runtime Library Exception, version
  19    3.1, as published by the Free Software Foundation.
  20
  21    You should have received a copy of the GNU General Public License and
  22    a copy of the GCC Runtime Library Exception along with this program;
  23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24    <http://www.gnu.org/licenses/>.  */
  25
  26 /* This file contains routines for managing work-share iteration, both
  27    for loops and sections.  */
  28
  29 #include "libgomp.h"
  30 #include <stdlib.h>
  31
  32
  33 /* This function implements the STATIC scheduling method.  The caller should
  34    iterate *pstart <= x < *pend.  Return zero if there are more iterations
  35    to perform; nonzero if not.  Return less than 0 if this thread had
  36    received the absolutely last iteration.  */
  37
  38 int
  39 gomp_iter_static_next (long *pstart, long *pend)
  40 {
  41   struct gomp_thread *thr = gomp_thread ();
  42   struct gomp_team *team = thr->ts.team;
  43   struct gomp_work_share *ws = thr->ts.work_share;
  44   unsigned long nthreads = team ? team->nthreads : 1;
  45
  46   if (thr->ts.static_trip == -1)
  47     return -1;
  48
  49   /* Quick test for degenerate teams and orphaned constructs.  */
  50   if (nthreads == 1)
  51     {
  52       *pstart = ws->next;
  53       *pend = ws->end;
  54       thr->ts.static_trip = -1;
  55       return ws->next == ws->end;
  56     }
  57
  58   /* We interpret chunk_size zero as "unspecified", which means that we
  59      should break up the iterations such that each thread makes only one
  60      trip through the outer loop.  */
  61   if (ws->chunk_size == 0)
  62     {
  63       unsigned long n, q, i, t;
  64       unsigned long s0, e0;
  65       long s, e;
  66
  67       if (thr->ts.static_trip > 0)
  68         return 1;
  69
  70       /* Compute the total number of iterations.  */
  71       s = ws->incr + (ws->incr > 0 ? -1 : 1);
  72       n = (ws->end - ws->next + s) / ws->incr;
  73       i = thr->ts.team_id;
  74
  75       /* Compute the "zero-based" start and end points.  That is, as
  76          if the loop began at zero and incremented by one.  */
  77       q = n / nthreads;
  78       t = n % nthreads;
  79       if (i < t)
  80         {
  81           t = 0;
  82           q++;
  83         }
  84       s0 = q * i + t;
  85       e0 = s0 + q;
  86
  87       /* Notice when no iterations allocated for this thread.  */
  88       if (s0 >= e0)
  89         {
  90           thr->ts.static_trip = 1;
  91           return 1;
  92         }
  93
  94       /* Transform these to the actual start and end numbers.  */
  95       s = (long)s0 * ws->incr + ws->next;
  96       e = (long)e0 * ws->incr + ws->next;
  97
  98       *pstart = s;
  99       *pend = e;
 100       thr->ts.static_trip = (e0 == n ? -1 : 1);
 101       return 0;
 102     }
 103   else
 104     {
 105       unsigned long n, s0, e0, i, c;
 106       long s, e;
 107
 108       /* Otherwise, each thread gets exactly chunk_size iterations
 109          (if available) each time through the loop.  */
 110
 111       s = ws->incr + (ws->incr > 0 ? -1 : 1);
 112       n = (ws->end - ws->next + s) / ws->incr;
 113       i = thr->ts.team_id;
 114       c = ws->chunk_size;
 115
 116       /* Initial guess is a C sized chunk positioned nthreads iterations
 117          in, offset by our thread number.  */
 118       s0 = (thr->ts.static_trip * nthreads + i) * c;
 119       e0 = s0 + c;
 120
 121       /* Detect overflow.  */
 122       if (s0 >= n)
 123         return 1;
 124       if (e0 > n)
 125         e0 = n;
 126
 127       /* Transform these to the actual start and end numbers.  */
 128       s = (long)s0 * ws->incr + ws->next;
 129       e = (long)e0 * ws->incr + ws->next;
 130
 131       *pstart = s;
 132       *pend = e;
 133
 134       if (e0 == n)
 135         thr->ts.static_trip = -1;
 136       else
 137         thr->ts.static_trip++;
 138       return 0;
 139     }
 140 }
 141
 142
 143 /* This function implements the DYNAMIC scheduling method.  Arguments are
 144    as for gomp_iter_static_next.  This function must be called with ws->lock
 145    held.  */
 146
 147 bool
 148 gomp_iter_dynamic_next_locked (long *pstart, long *pend)
 149 {
 150   struct gomp_thread *thr = gomp_thread ();
 151   struct gomp_work_share *ws = thr->ts.work_share;
 152   long start, end, chunk, left;
 153
 154   start = ws->next;
 155   if (start == ws->end)
 156     return false;
 157
 158   chunk = ws->chunk_size;
 159   left = ws->end - start;
 160   if (ws->incr < 0)
 161     {
 162       if (chunk < left)
 163         chunk = left;
 164     }
 165   else
 166     {
 167       if (chunk > left)
 168         chunk = left;
 169     }
 170   end = start + chunk;
 171
 172   ws->next = end;
 173   *pstart = start;
 174   *pend = end;
 175   return true;
 176 }
 177
 178
 179 #ifdef HAVE_SYNC_BUILTINS
 180 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 181    instead.  Note that the only memory value that changes is ws->next.  */
 182
 183 bool
 184 gomp_iter_dynamic_next (long *pstart, long *pend)
 185 {
 186   struct gomp_thread *thr = gomp_thread ();
 187   struct gomp_work_share *ws = thr->ts.work_share;
 188   long start, end, nend, chunk, incr;
 189
 190   end = ws->end;
 191   incr = ws->incr;
 192   chunk = ws->chunk_size;
 193
 194   if (__builtin_expect (ws->mode, 1))
 195     {
 196       long tmp = __sync_fetch_and_add (&ws->next, chunk);
 197       if (incr > 0)
 198         {
 199           if (tmp >= end)
 200             return false;
 201           nend = tmp + chunk;
 202           if (nend > end)
 203             nend = end;
 204           *pstart = tmp;
 205           *pend = nend;
 206           return true;
 207         }
 208       else
 209         {
 210           if (tmp <= end)
 211             return false;
 212           nend = tmp + chunk;
 213           if (nend < end)
 214             nend = end;
 215           *pstart = tmp;
 216           *pend = nend;
 217           return true;
 218         }
 219     }
 220
 221   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
 222   while (1)
 223     {
 224       long left = end - start;
 225       long tmp;
 226
 227       if (start == end)
 228         return false;
 229
 230       if (incr < 0)
 231         {
 232           if (chunk < left)
 233             chunk = left;
 234         }
 235       else
 236         {
 237           if (chunk > left)
 238             chunk = left;
 239         }
 240       nend = start + chunk;
 241
 242       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
 243       if (__builtin_expect (tmp == start, 1))
 244         break;
 245
 246       start = tmp;
 247     }
 248
 249   *pstart = start;
 250   *pend = nend;
 251   return true;
 252 }
 253 #endif /* HAVE_SYNC_BUILTINS */
 254
 255
 256 /* This function implements the GUIDED scheduling method.  Arguments are
 257    as for gomp_iter_static_next.  This function must be called with the
 258    work share lock held.  */
 259
 260 bool
 261 gomp_iter_guided_next_locked (long *pstart, long *pend)
 262 {
 263   struct gomp_thread *thr = gomp_thread ();
 264   struct gomp_work_share *ws = thr->ts.work_share;
 265   struct gomp_team *team = thr->ts.team;
 266   unsigned long nthreads = team ? team->nthreads : 1;
 267   unsigned long n, q;
 268   long start, end;
 269
 270   if (ws->next == ws->end)
 271     return false;
 272
 273   start = ws->next;
 274   n = (ws->end - start) / ws->incr;
 275   q = (n + nthreads - 1) / nthreads;
 276
 277   if (q < ws->chunk_size)
 278     q = ws->chunk_size;
 279   if (q <= n)
 280     end = start + q * ws->incr;
 281   else
 282     end = ws->end;
 283
 284   ws->next = end;
 285   *pstart = start;
 286   *pend = end;
 287   return true;
 288 }
 289
 290 #ifdef HAVE_SYNC_BUILTINS
 291 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 292    instead.  Note that the only memory value that changes is ws->next.  */
 293
 294 bool
 295 gomp_iter_guided_next (long *pstart, long *pend)
 296 {
 297   struct gomp_thread *thr = gomp_thread ();
 298   struct gomp_work_share *ws = thr->ts.work_share;
 299   struct gomp_team *team = thr->ts.team;
 300   unsigned long nthreads = team ? team->nthreads : 1;
 301   long start, end, nend, incr;
 302   unsigned long chunk_size;
 303
 304   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
 305   end = ws->end;
 306   incr = ws->incr;
 307   chunk_size = ws->chunk_size;
 308
 309   while (1)
 310     {
 311       unsigned long n, q;
 312       long tmp;
 313
 314       if (start == end)
 315         return false;
 316
 317       n = (end - start) / incr;
 318       q = (n + nthreads - 1) / nthreads;
 319
 320       if (q < chunk_size)
 321         q = chunk_size;
 322       if (__builtin_expect (q <= n, 1))
 323         nend = start + q * incr;
 324       else
 325         nend = end;
 326
 327       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
 328       if (__builtin_expect (tmp == start, 1))
 329         break;
 330
 331       start = tmp;
 332     }
 333
 334   *pstart = start;
 335   *pend = nend;
 336   return true;
 337 }
 338 #endif /* HAVE_SYNC_BUILTINS */