[RISC-V][PR target/116590] Avoid emitting multiple instructions from fmacc patterns
[gcc.git] / libgomp / iter.c
blob2c96a1bf372b33a46ab47ba5a7eec35816e295c4
1 /* Copyright (C) 2005-2024 Free Software Foundation, Inc.
2 Contributed by Richard Henderson <rth@redhat.com>.
4 This file is part of the GNU Offloading and Multi Processing Library
5 (libgomp).
7 Libgomp is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 /* This file contains routines for managing work-share iteration, both
27 for loops and sections. */
29 #include "libgomp.h"
30 #include <stdlib.h>
33 /* This function implements the STATIC scheduling method. The caller should
34 iterate *pstart <= x < *pend. Return zero if there are more iterations
35 to perform; nonzero if not. Return less than 0 if this thread had
36 received the absolutely last iteration. */
38 int
39 gomp_iter_static_next (long *pstart, long *pend)
41 struct gomp_thread *thr = gomp_thread ();
42 struct gomp_team *team = thr->ts.team;
43 struct gomp_work_share *ws = thr->ts.work_share;
44 unsigned long nthreads = team ? team->nthreads : 1;
46 if (thr->ts.static_trip == -1)
47 return -1;
49 /* Quick test for degenerate teams and orphaned constructs. */
50 if (nthreads == 1)
52 *pstart = ws->next;
53 *pend = ws->end;
54 thr->ts.static_trip = -1;
55 return ws->next == ws->end;
58 /* We interpret chunk_size zero as "unspecified", which means that we
59 should break up the iterations such that each thread makes only one
60 trip through the outer loop. */
61 if (ws->chunk_size == 0)
63 unsigned long n, q, i, t;
64 unsigned long s0, e0;
65 long s, e;
67 if (thr->ts.static_trip > 0)
68 return 1;
70 /* Compute the total number of iterations. */
71 s = ws->incr + (ws->incr > 0 ? -1 : 1);
72 n = (ws->end - ws->next + s) / ws->incr;
73 i = thr->ts.team_id;
75 /* Compute the "zero-based" start and end points. That is, as
76 if the loop began at zero and incremented by one. */
77 q = n / nthreads;
78 t = n % nthreads;
79 if (i < t)
81 t = 0;
82 q++;
84 s0 = q * i + t;
85 e0 = s0 + q;
87 /* Notice when no iterations allocated for this thread. */
88 if (s0 >= e0)
90 thr->ts.static_trip = 1;
91 return 1;
94 /* Transform these to the actual start and end numbers. */
95 s = (long)s0 * ws->incr + ws->next;
96 e = (long)e0 * ws->incr + ws->next;
98 *pstart = s;
99 *pend = e;
100 thr->ts.static_trip = (e0 == n ? -1 : 1);
101 return 0;
103 else
105 unsigned long n, s0, e0, i, c;
106 long s, e;
108 /* Otherwise, each thread gets exactly chunk_size iterations
109 (if available) each time through the loop. */
111 s = ws->incr + (ws->incr > 0 ? -1 : 1);
112 n = (ws->end - ws->next + s) / ws->incr;
113 i = thr->ts.team_id;
114 c = ws->chunk_size;
116 /* Initial guess is a C sized chunk positioned nthreads iterations
117 in, offset by our thread number. */
118 s0 = (thr->ts.static_trip * nthreads + i) * c;
119 e0 = s0 + c;
121 /* Detect overflow. */
122 if (s0 >= n)
123 return 1;
124 if (e0 > n)
125 e0 = n;
127 /* Transform these to the actual start and end numbers. */
128 s = (long)s0 * ws->incr + ws->next;
129 e = (long)e0 * ws->incr + ws->next;
131 *pstart = s;
132 *pend = e;
134 if (e0 == n)
135 thr->ts.static_trip = -1;
136 else
137 thr->ts.static_trip++;
138 return 0;
143 /* This function implements the DYNAMIC scheduling method. Arguments are
144 as for gomp_iter_static_next. This function must be called with ws->lock
145 held. */
147 bool
148 gomp_iter_dynamic_next_locked (long *pstart, long *pend)
150 struct gomp_thread *thr = gomp_thread ();
151 struct gomp_work_share *ws = thr->ts.work_share;
152 long start, end, chunk, left;
154 start = ws->next;
155 if (start == ws->end)
156 return false;
158 chunk = ws->chunk_size;
159 left = ws->end - start;
160 if (ws->incr < 0)
162 if (chunk < left)
163 chunk = left;
165 else
167 if (chunk > left)
168 chunk = left;
170 end = start + chunk;
172 ws->next = end;
173 *pstart = start;
174 *pend = end;
175 return true;
179 #ifdef HAVE_SYNC_BUILTINS
180 /* Similar, but doesn't require the lock held, and uses compare-and-swap
181 instead. Note that the only memory value that changes is ws->next. */
183 bool
184 gomp_iter_dynamic_next (long *pstart, long *pend)
186 struct gomp_thread *thr = gomp_thread ();
187 struct gomp_work_share *ws = thr->ts.work_share;
188 long start, end, nend, chunk, incr;
190 end = ws->end;
191 incr = ws->incr;
192 chunk = ws->chunk_size;
194 if (__builtin_expect (ws->mode, 1))
196 long tmp = __sync_fetch_and_add (&ws->next, chunk);
197 if (incr > 0)
199 if (tmp >= end)
200 return false;
201 nend = tmp + chunk;
202 if (nend > end)
203 nend = end;
204 *pstart = tmp;
205 *pend = nend;
206 return true;
208 else
210 if (tmp <= end)
211 return false;
212 nend = tmp + chunk;
213 if (nend < end)
214 nend = end;
215 *pstart = tmp;
216 *pend = nend;
217 return true;
221 start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
222 while (1)
224 long left = end - start;
225 long tmp;
227 if (start == end)
228 return false;
230 if (incr < 0)
232 if (chunk < left)
233 chunk = left;
235 else
237 if (chunk > left)
238 chunk = left;
240 nend = start + chunk;
242 tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
243 if (__builtin_expect (tmp == start, 1))
244 break;
246 start = tmp;
249 *pstart = start;
250 *pend = nend;
251 return true;
253 #endif /* HAVE_SYNC_BUILTINS */
256 /* This function implements the GUIDED scheduling method. Arguments are
257 as for gomp_iter_static_next. This function must be called with the
258 work share lock held. */
260 bool
261 gomp_iter_guided_next_locked (long *pstart, long *pend)
263 struct gomp_thread *thr = gomp_thread ();
264 struct gomp_work_share *ws = thr->ts.work_share;
265 struct gomp_team *team = thr->ts.team;
266 unsigned long nthreads = team ? team->nthreads : 1;
267 unsigned long n, q;
268 long start, end;
270 if (ws->next == ws->end)
271 return false;
273 start = ws->next;
274 n = (ws->end - start) / ws->incr;
275 q = (n + nthreads - 1) / nthreads;
277 if (q < ws->chunk_size)
278 q = ws->chunk_size;
279 if (q <= n)
280 end = start + q * ws->incr;
281 else
282 end = ws->end;
284 ws->next = end;
285 *pstart = start;
286 *pend = end;
287 return true;
290 #ifdef HAVE_SYNC_BUILTINS
291 /* Similar, but doesn't require the lock held, and uses compare-and-swap
292 instead. Note that the only memory value that changes is ws->next. */
294 bool
295 gomp_iter_guided_next (long *pstart, long *pend)
297 struct gomp_thread *thr = gomp_thread ();
298 struct gomp_work_share *ws = thr->ts.work_share;
299 struct gomp_team *team = thr->ts.team;
300 unsigned long nthreads = team ? team->nthreads : 1;
301 long start, end, nend, incr;
302 unsigned long chunk_size;
304 start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
305 end = ws->end;
306 incr = ws->incr;
307 chunk_size = ws->chunk_size;
309 while (1)
311 unsigned long n, q;
312 long tmp;
314 if (start == end)
315 return false;
317 n = (end - start) / incr;
318 q = (n + nthreads - 1) / nthreads;
320 if (q < chunk_size)
321 q = chunk_size;
322 if (__builtin_expect (q <= n, 1))
323 nend = start + q * incr;
324 else
325 nend = end;
327 tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
328 if (__builtin_expect (tmp == start, 1))
329 break;
331 start = tmp;
334 *pstart = start;
335 *pend = nend;
336 return true;
338 #endif /* HAVE_SYNC_BUILTINS */