Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / openmp / runtime / test / tasking / kmp_task_reduction_nest.cpp
blob63dffe44dad5c040449b6650fa946e573b5be23b
1 // RUN: %libomp-cxx-compile-and-run
2 // RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
3 // GCC-5 is needed for OpenMP 4.0 support (taskgroup)
4 // XFAIL: gcc-4
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
10 // Total number of loop iterations; should be a multiple of the number of iterations handled per task (2 in this test)
11 #define N 10000
13 // Flag to request lazy (1) or eager (0) allocation of reduction objects
14 #ifndef FLG
15 #define FLG 0
16 #endif
19 // initial user's code that corresponds to pseudo code of the test
20 #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
22 for( int l = 0; l < N; ++l ) {
23 #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
25 i += l;
26 if( l%2 )
27 x *= 1.0 / (l + 1);
28 else
29 x *= (l + 1);
33 #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
35 for( int l = 0; l < N; ++l ) {
36 #pragma omp task firstprivate(l) in_reduction(+:j,y) \
37 in_reduction(*:x) in_reduction(-:k)
39 j += l;
40 k -= l;
41 y += (double)l;
42 if( l%2 )
43 x *= 1.0 / (l + 1);
44 else
45 x *= (l + 1);
47 #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
49 i -= l;
50 k -= l;
51 y += (double)l;
53 #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
55 j += l;
56 if( l%2 )
57 x *= 1.0 / (l + 1);
58 else
59 x *= (l + 1);
62 } // inner reduction
64 for( int l = 0; l < N; ++l ) {
65 #pragma omp task firstprivate(l) in_reduction(+:j)
66 j += l;
68 } // outer reduction
//------------------------------------------------
// OpenMP runtime library routines
//
// Prototypes of the runtime entry points that this test calls by hand in
// place of compiler-generated code.  They are declared with C linkage when
// the file is compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif
// Called in this test with a taskgroup handle returned by
// __kmpc_task_reduction_init (or NULL) and the address of either the shared
// reduction variable or a thread-specific copy of it; the result is used as
// the calling thread's private copy of that item.
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
// Called in this test with the number of reduction items and an array of
// _task_red_item_t descriptors; the result is passed as the `tg` argument of
// __kmpc_task_reduction_get_th_data.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
// Called in this test with NULL; the result is passed as the `gtid` argument
// of the two routines above.
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif
//------------------------------------------------
// Compiler-generated code

// Descriptor of a single reduction item.  An array of these is handed to
// __kmpc_task_reduction_init, one element per reduced variable.
struct _task_red_item {
  void *shar;     // address of the shared (original) reduction variable
  size_t size;    // size of the reduction variable in bytes
  void *f_init;   // initializer callback, NULL if none is supplied
  void *f_fini;   // finalizer callback, NULL if none is supplied
  void *f_comb;   // combiner callback
  unsigned flags; // set to FLG by this test (1 = lazy, 0 = eager allocation)
};
typedef struct _task_red_item _task_red_item_t;
// int:+  needs neither init nor fini callback; the same combiner serves
// subtraction reductions as well.
// Adds the int at rhs into the int at lhs; rhs is left unchanged.
void __red_int_add_comb(void *lhs, void *rhs)
{
  int *acc = (int *)lhs;
  const int *part = (const int *)rhs;
  *acc += *part;
}
// long long:+  needs neither init nor fini callback; the same combiner
// serves subtraction reductions as well.
// Adds the long long at rhs into the long long at lhs; rhs is left unchanged.
void __red_llong_add_comb(void *lhs, void *rhs)
{
  long long *acc = (long long *)lhs;
  const long long *part = (const long long *)rhs;
  *acc += *part;
}
// double:*  no fini callback is needed.
// Initializer: stores the multiplicative identity 1.0 into the double at data.
void __red_dbl_mul_init(void *data)
{
  double *item = (double *)data;
  *item = 1.0;
}
// Combiner for double:*  multiplies the double at lhs by the double at rhs;
// rhs is left unchanged.
void __red_dbl_mul_comb(void *lhs, void *rhs)
{
  double *acc = (double *)lhs;
  const double *part = (const double *)rhs;
  *acc *= *part;
}
// double:+  needs neither init nor fini callback.
// Adds the double at rhs into the double at lhs; rhs is left unchanged.
void __red_dbl_add_comb(void *lhs, void *rhs)
{
  double *acc = (double *)lhs;
  const double *part = (const double *)rhs;
  *acc += *part;
}
113 // ==============================
115 void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
117 for( int l = 0; l < N; ++l ) {
118 *pi += l;
119 if( l%2 )
120 *px *= 1.0 / (l + 1);
121 else
122 *px *= (l + 1);
124 for( int l = 0; l < N; ++l ) {
125 *pj += l;
126 *pk -= l;
127 *py += (double)l;
128 if( l%2 )
129 *px *= 1.0 / (l + 1);
130 else
131 *px *= (l + 1);
133 *pi -= l;
134 *pk -= l;
135 *py += (double)l;
137 *pj += l;
138 if( l%2 )
139 *px *= 1.0 / (l + 1);
140 else
141 *px *= (l + 1);
143 for( int l = 0; l < N; ++l ) {
144 *pj += l;
148 //------------------------------------------------
149 // Test case
150 int main()
152 int nthreads = omp_get_max_threads();
153 int err = 0;
154 void** ptrs = (void**)malloc(nthreads*sizeof(void*));
156 // user's code ======================================
157 // variables for serial calculations:
158 int is = 3;
159 long long js = -9999999;
160 double xs = 99999.0;
161 long long ks = 99999999;
162 double ys = -99999999.0;
163 // variables for parallel calculations:
164 int ip = 3;
165 long long jp = -9999999;
166 double xp = 99999.0;
167 long long kp = 99999999;
168 double yp = -99999999.0;
170 calc_serial(&is, &js, &xs, &ks, &ys);
171 // ==================================================
172 for (int i = 0; i < nthreads; ++i)
173 ptrs[i] = NULL;
174 #pragma omp parallel
176 #pragma omp single nowait
178 // outer taskgroup reduces (i,j,x)
179 #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
181 _task_red_item_t red_data[3];
182 red_data[0].shar = &ip;
183 red_data[0].size = sizeof(ip);
184 red_data[0].f_init = NULL; // RTL will zero thread-specific objects
185 red_data[0].f_fini = NULL; // no destructors needed
186 red_data[0].f_comb = (void*)&__red_int_add_comb;
187 red_data[0].flags = FLG;
188 red_data[1].shar = &jp;
189 red_data[1].size = sizeof(jp);
190 red_data[1].f_init = NULL; // RTL will zero thread-specific objects
191 red_data[1].f_fini = NULL; // no destructors needed
192 red_data[1].f_comb = (void*)&__red_llong_add_comb;
193 red_data[1].flags = FLG;
194 red_data[2].shar = &xp;
195 red_data[2].size = sizeof(xp);
196 red_data[2].f_init = (void*)&__red_dbl_mul_init;
197 red_data[2].f_fini = NULL; // no destructors needed
198 red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
199 red_data[2].flags = FLG;
200 int gtid = __kmpc_global_thread_num(NULL);
201 void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
203 for( int l = 0; l < N; l += 2 ) {
204 // 2 iterations per task to get correct x value; actually any even
205 // number of iters per task will work, otherwise x looses precision
206 #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
208 int gtid = __kmpc_global_thread_num(NULL);
209 int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
210 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
211 gtid, tg1, &xp);
212 if (!ptrs[gtid]) ptrs[gtid] = p_xp;
214 // user's pseudo-code ==============================
215 *p_ip += l;
216 *p_xp *= (l + 1);
218 *p_ip += l + 1;
219 *p_xp *= 1.0 / (l + 2);
220 // ==================================================
223 // inner taskgroup reduces (i,k,y), i is same object as in outer one
224 #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
226 _task_red_item_t red_data[3];
227 red_data[0].shar = &ip;
228 red_data[0].size = sizeof(ip);
229 red_data[0].f_init = NULL; // RTL will zero thread-specific objects
230 red_data[0].f_fini = NULL; // no destructors needed
231 red_data[0].f_comb = (void*)&__red_int_add_comb;
232 red_data[0].flags = FLG;
233 red_data[1].shar = &kp;
234 red_data[1].size = sizeof(kp);
235 red_data[1].f_init = NULL; // RTL will zero thread-specific objects
236 red_data[1].f_fini = NULL; // no destructors needed
237 red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
238 red_data[1].flags = FLG;
239 red_data[2].shar = &yp;
240 red_data[2].size = sizeof(yp);
241 red_data[2].f_init = NULL; // RTL will zero thread-specific objects
242 red_data[2].f_fini = NULL; // no destructors needed
243 red_data[2].f_comb = (void*)&__red_dbl_add_comb;
244 red_data[2].flags = FLG;
245 int gtid = __kmpc_global_thread_num(NULL);
246 void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
248 for( int l = 0; l < N; l += 2 ) {
249 #pragma omp task firstprivate(l)
250 // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
252 int gtid = __kmpc_global_thread_num(NULL);
253 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
254 gtid, tg1, &jp);
255 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
256 gtid, tg2, &kp);
257 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
258 gtid, tg1, &xp);
259 double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
260 gtid, tg2, &yp);
261 // user's pseudo-code ==============================
262 *p_jp += l;
263 *p_kp -= l;
264 *p_yp += (double)l;
265 *p_xp *= (l + 1);
267 *p_jp += l + 1;
268 *p_kp -= l + 1;
269 *p_yp += (double)(l + 1);
270 *p_xp *= 1.0 / (l + 2);
271 // =================================================
273 // the following code is here just to check __kmpc_task_reduction_get_th_data:
274 int tid = omp_get_thread_num();
275 void *addr1;
276 void *addr2;
277 addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
278 addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
279 if (addr1 != addr2) {
280 #pragma omp atomic
281 ++err;
282 printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
284 // from neighbour w/o taskgroup (should start lookup from current tg2)
285 if (tid > 0) {
286 if (ptrs[tid-1]) {
287 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
288 if (addr1 != addr2) {
289 #pragma omp atomic
290 ++err;
291 printf("Wrong thread-specific addresses %d s:%p n:%p\n",
292 tid, addr1, addr2);
295 } else {
296 if (ptrs[nthreads-1]) {
297 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
298 if (addr1 != addr2) {
299 #pragma omp atomic
300 ++err;
301 printf("Wrong thread-specific addresses %d s:%p n:%p\n",
302 tid, addr1, addr2);
306 // ----------------------------------------------
309 #pragma omp task firstprivate(l)
310 // in_reduction(+:y) in_reduction(-:i,k)
312 int gtid = __kmpc_global_thread_num(NULL);
313 int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
314 gtid, tg2, &ip);
315 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
316 gtid, tg2, &kp);
317 double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
318 gtid, tg2, &yp);
320 // user's pseudo-code ==============================
321 *p_ip -= l;
322 *p_kp -= l;
323 *p_yp += (double)l;
325 *p_ip -= l + 1;
326 *p_kp -= l + 1;
327 *p_yp += (double)(l + 1);
328 // =================================================
330 #pragma omp task firstprivate(l)
331 // in_reduction(+:j) in_reduction(*:x)
333 int gtid = __kmpc_global_thread_num(NULL);
334 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
335 gtid, tg1, &jp);
336 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
337 gtid, tg1, &xp);
338 // user's pseudo-code ==============================
339 *p_jp += l;
340 *p_xp *= (l + 1);
342 *p_jp += l + 1;
343 *p_xp *= 1.0 / (l + 2);
344 // =================================================
347 } // inner reduction
349 for( int l = 0; l < N; l += 2 ) {
350 #pragma omp task firstprivate(l) // in_reduction(+:j)
352 int gtid = __kmpc_global_thread_num(NULL);
353 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
354 gtid, tg1, &jp);
355 // user's pseudo-code ==============================
356 *p_jp += l;
357 *p_jp += l + 1;
358 // =================================================
361 } // outer reduction
362 } // end single
363 } // end parallel
364 // check results
365 #if _DEBUG
366 printf("reduction flags = %u\n", FLG);
367 #endif
368 if (ip == is && jp == js && ks == kp &&
369 fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
370 printf("passed\n");
371 else
372 printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
373 is, js, xs, ks, ys,
374 ip, jp, xp, kp, yp);
375 return 0;