openmp/runtime/test/tasking/kmp_task_reduction_nest.cpp

   1 // RUN: %libomp-cxx-compile-and-run
   2 // RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
   3 // GCC-5 is needed for OpenMP 4.0 support (taskgroup)
   4 // XFAIL: gcc-4
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
   9
// Total number of loop iterations; must be even, because every task in this
// test processes two consecutive iterations (l and l + 1)
  11 #define N 10000
  12
  13 // Flag to request lazy (1) or eager (0) allocation of reduction objects
  14 #ifndef FLG
  15 #define FLG 0
  16 #endif
  17
  18 /*
  19   // initial user's code that corresponds to pseudo code of the test
  20   #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
  21   {
  22     for( int l = 0; l < N; ++l ) {
  23       #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
  24       {
  25         i += l;
  26         if( l%2 )
  27           x *= 1.0 / (l + 1);
  28         else
  29           x *= (l + 1);
  30       }
  31     }
  32
  33     #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  34     {
  35       for( int l = 0; l < N; ++l ) {
  36         #pragma omp task firstprivate(l) in_reduction(+:j,y) \
  37             in_reduction(*:x) in_reduction(-:k)
  38         {
  39           j += l;
  40           k -= l;
  41           y += (double)l;
  42           if( l%2 )
  43             x *= 1.0 / (l + 1);
  44           else
  45             x *= (l + 1);
  46         }
  47         #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
  48         {
  49           i -= l;
  50           k -= l;
  51           y += (double)l;
  52         }
  53         #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
  54         {
  55           j += l;
  56           if( l%2 )
  57             x *= 1.0 / (l + 1);
  58           else
  59             x *= (l + 1);
  60         }
  61       }
  62     } // inner reduction
  63
  64     for( int l = 0; l < N; ++l ) {
  65       #pragma omp task firstprivate(l) in_reduction(+:j)
  66         j += l;
  67     }
  68   } // outer reduction
  69 */
  70
  71 //------------------------------------------------
  72 // OpenMP runtime library routines
  73 #ifdef __cplusplus
  74 extern "C" {
  75 #endif
  76 extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
  77 extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
  78 extern int __kmpc_global_thread_num(void*);
  79 #ifdef __cplusplus
  80 }
  81 #endif
  82
  83 //------------------------------------------------
  84 // Compiler-generated code
  85
  86 typedef struct _task_red_item {
  87     void       *shar; // shared reduction item
  88     size_t      size; // size of data item
  89     void       *f_init; // data initialization routine
  90     void       *f_fini; // data finalization routine
  91     void       *f_comb; // data combiner routine
  92     unsigned    flags;
  93 } _task_red_item_t;
  94
  95 // int:+   no need in init/fini callbacks, valid for subtraction
  96 void __red_int_add_comb(void *lhs, void *rhs) // combiner
  97 { *(int*)lhs += *(int*)rhs; }
  98
  99 // long long:+   no need in init/fini callbacks, valid for subtraction
 100 void __red_llong_add_comb(void *lhs, void *rhs) // combiner
 101 { *(long long*)lhs += *(long long*)rhs; }
 102
 103 // double:*   no need in fini callback
 104 void __red_dbl_mul_init(void *data) // initializer
 105 { *(double*)data = 1.0; }
 106 void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
 107 { *(double*)lhs *= *(double*)rhs; }
 108
 109 // double:+   no need in init/fini callbacks
 110 void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
 111 { *(double*)lhs += *(double*)rhs; }
 112
 113 // ==============================
 114
 115 void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
 116 {
 117     for( int l = 0; l < N; ++l ) {
 118         *pi += l;
 119         if( l%2 )
 120           *px *= 1.0 / (l + 1);
 121         else
 122           *px *= (l + 1);
 123     }
 124     for( int l = 0; l < N; ++l ) {
 125         *pj += l;
 126         *pk -= l;
 127         *py += (double)l;
 128         if( l%2 )
 129             *px *= 1.0 / (l + 1);
 130         else
 131             *px *= (l + 1);
 132
 133         *pi -= l;
 134         *pk -= l;
 135         *py += (double)l;
 136
 137         *pj += l;
 138         if( l%2 )
 139             *px *= 1.0 / (l + 1);
 140         else
 141             *px *= (l + 1);
 142     }
 143     for( int l = 0; l < N; ++l ) {
 144         *pj += l;
 145     }
 146 }
 147
 148 //------------------------------------------------
 149 // Test case
 150 int main()
 151 {
 152   int nthreads = omp_get_max_threads();
 153   int err = 0;
 154   void** ptrs = (void**)malloc(nthreads*sizeof(void*));
 155
 156   // user's code ======================================
 157   // variables for serial calculations:
 158   int is = 3;
 159   long long js = -9999999;
 160   double xs = 99999.0;
 161   long long ks = 99999999;
 162   double ys = -99999999.0;
 163   // variables for parallel calculations:
 164   int ip = 3;
 165   long long jp = -9999999;
 166   double xp = 99999.0;
 167   long long kp = 99999999;
 168   double yp = -99999999.0;
 169
 170   calc_serial(&is, &js, &xs, &ks, &ys);
 171   // ==================================================
 172   for (int i = 0; i < nthreads; ++i)
 173     ptrs[i] = NULL;
 174   #pragma omp parallel
 175   {
 176     #pragma omp single nowait
 177     {
 178       // outer taskgroup reduces (i,j,x)
 179       #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
 180       {
 181         _task_red_item_t red_data[3];
 182         red_data[0].shar = &ip;
 183         red_data[0].size = sizeof(ip);
 184         red_data[0].f_init = NULL; // RTL will zero thread-specific objects
 185         red_data[0].f_fini = NULL; // no destructors needed
 186         red_data[0].f_comb = (void*)&__red_int_add_comb;
 187         red_data[0].flags = FLG;
 188         red_data[1].shar = &jp;
 189         red_data[1].size = sizeof(jp);
 190         red_data[1].f_init = NULL; // RTL will zero thread-specific objects
 191         red_data[1].f_fini = NULL; // no destructors needed
 192         red_data[1].f_comb = (void*)&__red_llong_add_comb;
 193         red_data[1].flags = FLG;
 194         red_data[2].shar = &xp;
 195         red_data[2].size = sizeof(xp);
 196         red_data[2].f_init = (void*)&__red_dbl_mul_init;
 197         red_data[2].f_fini = NULL; // no destructors needed
 198         red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
 199         red_data[2].flags = FLG;
 200         int gtid = __kmpc_global_thread_num(NULL);
 201         void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
 202
 203         for( int l = 0; l < N; l += 2 ) {
 204           // 2 iterations per task to get correct x value; actually any even
 205           // number of iters per task will work, otherwise x looses precision
 206           #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
 207           {
 208             int gtid = __kmpc_global_thread_num(NULL);
 209             int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
 210             double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
 211                                         gtid, tg1, &xp);
 212             if (!ptrs[gtid]) ptrs[gtid] = p_xp;
 213
 214             // user's pseudo-code ==============================
 215             *p_ip += l;
 216             *p_xp *= (l + 1);
 217
 218             *p_ip += l + 1;
 219             *p_xp *= 1.0 / (l + 2);
 220             // ==================================================
 221           }
 222         }
 223         // inner taskgroup reduces (i,k,y), i is same object as in outer one
 224         #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
 225         {
 226           _task_red_item_t red_data[3];
 227           red_data[0].shar = &ip;
 228           red_data[0].size = sizeof(ip);
 229           red_data[0].f_init = NULL; // RTL will zero thread-specific objects
 230           red_data[0].f_fini = NULL; // no destructors needed
 231           red_data[0].f_comb = (void*)&__red_int_add_comb;
 232           red_data[0].flags = FLG;
 233           red_data[1].shar = &kp;
 234           red_data[1].size = sizeof(kp);
 235           red_data[1].f_init = NULL; // RTL will zero thread-specific objects
 236           red_data[1].f_fini = NULL; // no destructors needed
 237           red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
 238           red_data[1].flags = FLG;
 239           red_data[2].shar = &yp;
 240           red_data[2].size = sizeof(yp);
 241           red_data[2].f_init = NULL; // RTL will zero thread-specific objects
 242           red_data[2].f_fini = NULL; // no destructors needed
 243           red_data[2].f_comb = (void*)&__red_dbl_add_comb;
 244           red_data[2].flags = FLG;
 245           int gtid = __kmpc_global_thread_num(NULL);
 246           void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
 247
 248           for( int l = 0; l < N; l += 2 ) {
 249             #pragma omp task firstprivate(l)
 250             // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
 251             {
 252               int gtid = __kmpc_global_thread_num(NULL);
 253               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
 254                                                 gtid, tg1, &jp);
 255               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
 256                                                 gtid, tg2, &kp);
 257               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
 258                                           gtid, tg1, &xp);
 259               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
 260                                           gtid, tg2, &yp);
 261               // user's pseudo-code ==============================
 262               *p_jp += l;
 263               *p_kp -= l;
 264               *p_yp += (double)l;
 265               *p_xp *= (l + 1);
 266
 267               *p_jp += l + 1;
 268               *p_kp -= l + 1;
 269               *p_yp += (double)(l + 1);
 270               *p_xp *= 1.0 / (l + 2);
 271               // =================================================
 272 {
 273   // the following code is here just to check __kmpc_task_reduction_get_th_data:
 274   int tid = omp_get_thread_num();
 275   void *addr1;
 276   void *addr2;
 277   addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
 278   addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
 279   if (addr1 != addr2) {
 280     #pragma omp atomic
 281       ++err;
 282     printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
 283   }
 284   // from neighbour w/o taskgroup (should start lookup from current tg2)
 285   if (tid > 0) {
 286     if (ptrs[tid-1]) {
 287       addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
 288       if (addr1 != addr2) {
 289         #pragma omp atomic
 290           ++err;
 291         printf("Wrong thread-specific addresses %d s:%p n:%p\n",
 292                tid, addr1, addr2);
 293       }
 294     }
 295   } else {
 296     if (ptrs[nthreads-1]) {
 297       addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
 298       if (addr1 != addr2) {
 299         #pragma omp atomic
 300           ++err;
 301         printf("Wrong thread-specific addresses %d s:%p n:%p\n",
 302                tid, addr1, addr2);
 303       }
 304     }
 305   }
 306   // ----------------------------------------------
 307 }
 308             }
 309             #pragma omp task firstprivate(l)
 310             // in_reduction(+:y) in_reduction(-:i,k)
 311             {
 312               int gtid = __kmpc_global_thread_num(NULL);
 313               int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
 314                                     gtid, tg2, &ip);
 315               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
 316                                                 gtid, tg2, &kp);
 317               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
 318                                           gtid, tg2, &yp);
 319
 320               // user's pseudo-code ==============================
 321               *p_ip -= l;
 322               *p_kp -= l;
 323               *p_yp += (double)l;
 324
 325               *p_ip -= l + 1;
 326               *p_kp -= l + 1;
 327               *p_yp += (double)(l + 1);
 328               // =================================================
 329             }
 330             #pragma omp task firstprivate(l)
 331             // in_reduction(+:j) in_reduction(*:x)
 332             {
 333               int gtid = __kmpc_global_thread_num(NULL);
 334               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
 335                                                 gtid, tg1, &jp);
 336               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
 337                                           gtid, tg1, &xp);
 338               // user's pseudo-code ==============================
 339               *p_jp += l;
 340               *p_xp *= (l + 1);
 341
 342               *p_jp += l + 1;
 343               *p_xp *= 1.0 / (l + 2);
 344               // =================================================
 345             }
 346           }
 347         } // inner reduction
 348
 349         for( int l = 0; l < N; l += 2 ) {
 350           #pragma omp task firstprivate(l) // in_reduction(+:j)
 351           {
 352             int gtid = __kmpc_global_thread_num(NULL);
 353             long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
 354                                               gtid, tg1, &jp);
 355             // user's pseudo-code ==============================
 356             *p_jp += l;
 357             *p_jp += l + 1;
 358             // =================================================
 359           }
 360         }
 361       } // outer reduction
 362     } // end single
 363   } // end parallel
 364   // check results
 365 #if _DEBUG
 366   printf("reduction flags = %u\n", FLG);
 367 #endif
 368   if (ip == is && jp == js && ks == kp &&
 369       fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
 370     printf("passed\n");
 371   else
 372     printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
 373       is, js, xs, ks, ys,
 374       ip, jp, xp, kp, yp);
 375   return 0;
 376 }