/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_error.h"
#include "kmp_stats.h"
#include "kmp_collapse.h"
#include "ompt-specific.h"

// OMPTODO: different style of comments (see kmp_sched)
// avoid inadvertently using a library based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}
template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
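// For example, the CollapseAllocator<kmp_loop_nest_iv_t> iterations(n) objects
// used below keep their storage on the stack for nests of up to 4 loops
// (32 bytes / 8-byte elements) and only fall back to __kmp_allocate/__kmp_free
// for deeper nests.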
//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depends on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
    // because ub0 + ub1*j should be still positive (otherwise loop was not
    // well formed)
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}
//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }

  return bounds->trip_count;
}
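// For example, a canonicalized level with lb0 = 0, ub0 = 9, step = 2 and the
// comp_less_or_eq comparison gets trip_count = (9 - 0) / 2 + 1 = 5
// (iterations 0, 2, 4, 6, 8).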
// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}
//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
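// For example, for loop_type_int8 an original_iv of 0xFF is first truncated to
// kmp_int8 (-1) and then sign-extended, so the returned kmp_uint64 is
// 0xFFFFFFFFFFFFFFFF and casting it back to kmp_int8 recovers -1.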
//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}
// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  T temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}
bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
  default:
    KMP_ASSERT(false);
    return false;
  }
}
//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// (The rectangular variant does not need to re-check the bounds.)
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  T temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}
void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}
//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
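// For a rectangular nest the overall trip count is just the product of the
// per-level trip counts, e.g. a 10 x 20 collapse(2) nest yields 200.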
//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;

    new_iv = temp;
    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so lower bound should be smallest
// value, upper bound should be the biggest value. If the loop won't execute,
// 'smallest' may be bigger than 'biggest', but we'd better not switch them
// around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of previous ones; can't be the outermost
    // one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to the multiple of step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}
// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of previous ones; can't be the outermost
    // one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to the multiple of step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}
// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped shape.
// We need to stay in the original range of values, so that there will be no
// overflow, for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
    // The idea here is that for this IV we are now getting the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}
// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IV's on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}
//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
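// For example, a <= level with lb0 = 0, lb1 = 0 and step = 2 maps the original
// iv value 6 back to iteration (6 - 0 - 0) / 2 = 3.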
// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
  default:
    KMP_ASSERT(false);
    return 0;
  }
}
//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust to fit into
// original loops bounds. Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
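// In effect new_iv is a mixed-radix number whose digits are the per-level
// iteration counts, e.g. with trip counts {4, 3} and per-level iterations
// {2, 1} the combined value is 2 * 3 + 1 = 7.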
//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}
//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: lower bounds of all loops may not work -
// if on some of the iterations of the outer loops inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}
//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}
//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  //     big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    //   that
    // - we have to go into original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    //   starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than its start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}
// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
  default:
    KMP_ASSERT(false);
    return false;
  }
}
// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting result need to make sure that the
// new chunk starts at next position to old chunk, not overlaps with it (this is
// done elsewhere), and need to make sure end of the chunk is further than the
// beginning of the chunk. We don't need an exact ending point here, just
// something more-or-less close to the desired chunk length, bigger is fine
// (smaller would be fine, but we risk going into infinite loop, so do smaller
// only at the very end of the space). result: false if could not find the
// ending point in the original loop space. In this case the caller can use
// original upper bounds as the end of the chunk. Chunk won't be empty, because
// it'll have at least the starting point, which is by construction in the
// original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}
//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}
void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}
// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not)
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}
/**************************************************************************
 * Identify nested loop structure - loops come in the canonical form
 * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
 *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
 * Upper Triangle matrix
 *                        i = 0;     i <= N; i++    {0,0}:{N,0}
 *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 * ************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
                                   /*in*/ kmp_index_t n) {
  // only 2-level nested loops are supported
  if (n != 2) {
    return nested_loop_type_unkown;
  }
  // loops must be canonical
  KMP_ASSERT(
      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb0_u64);
  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb1_u64);
  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub1_u64);
  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
    return nested_loop_type_unkown;
  }
  // check inner bounds to determine triangle type
  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb0_u64);
  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub0_u64);
  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb1_u64);
  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub1_u64);
  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
    return nested_loop_type_lower_triangular_matrix;
  }
  // upper triangle loop inner bounds need to be {0,1}:{N,0}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
    return nested_loop_type_upper_triangular_matrix;
  }
  return nested_loop_type_unkown;
}
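// For example, "for (i = 0; i <= N; i++) for (j = 0; j < i; j++)" canonicalizes
// to j <= -1 + 1*i and is recognized as a lower triangular matrix, while
// "for (i = 0; i <= N; i++) for (j = i; j <= N; j++)" is recognized as an
// upper triangular matrix.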
/**************************************************************************
 * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
 * Start point is x so the result is always > sqrt(x)
 * The method has uniform convergence, PRECISION is set to 0.1
 * ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}
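// For example, sqrt_newton_approx(16) iterates 16 -> 8.5 -> ~5.19 -> ~4.14 ->
// ~4.00, stopping once two consecutive estimates differ by at most
// level_of_precision and returning a value slightly above the exact root 4.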
/**************************************************************************
 * Handle lower triangle matrix in the canonical form
 * i = 0; i <= N; i++          {0,0}:{N,0}
 * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
 * ************************************************************************/
void kmp_handle_lower_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                    original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to outer upper bound (inclusive since the
  // loop is canonical); note that less_than inner loops (inner_ub0 = -1)
  // effectively make the progression 1-based, making N = (outer_ub0 - inner_lb0
  // + 1) -> N - 1
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: total number of iterations
  // divided by the number of threads plus, if there's a remainder,
  // the first threads with the number up to the remainder get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with the tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them, so the cumulative number of
  // iterations executed by all the previous ones will be the current thread's
  // number of iterations multiplied by the number of previous threads which is
  // equal to the current thread's tid; threads with the number equal or above
  // the remainder will have (iter_total/nth) elements, so the cumulative number
  // of iterations previously executed is its number of iterations multiplied by
  // the number of previous threads which is again equal to the current thread's
  // tid PLUS all the remainder iterations that will have been executed by the
  // previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
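  // For example, with iter_total = 10 and nth = 4 the remainder is 2, so
  // threads 0 and 1 get 3 iterations and threads 2 and 3 get 2; the resulting
  // iter_before_current values are 0, 3, 6 and 8, and the iter_with_current
  // values are 3, 6, 8 and 10.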
  // calculate the outer loop lower bound (lbo) which is the max outer iv value
  // that gives the number of iterations that is equal or just below the total
  // number of iterations executed by the previous threads; for less_than
  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0,
  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be i.e.
  // lbo*(lbo+1)/2<=iter_before_current =>
  // lbo^2+lbo-2*iter_before_current<=0; both cases can be handled similarly
  // using a parameter to control the equation sign
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
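  // For example, with a 0-based inner loop (inner_adjustment = 1) and
  // iter_before_current = 6, sqrt_newton_approx(49) is just above 7, so
  // lower_bound_outer = (7 + 1) / 2 - 1 = 3 and lower_bound_inner =
  // 6 - (3 * 4) / 2 = 0: rows 0..2 hold exactly 1 + 2 + 3 = 6 iterations, so
  // this thread starts at row 3, column 0.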
  // calculate the outer loop upper bound using the same approach as for the
  // lower bound, except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration of
  // the current thread, not the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration space
  // is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current,
         iter_total);
#endif
}
/**************************************************************************
 * Handle upper triangle matrix in the canonical form
 * i = 0; i <= N; i++     {0,0}:{N,0}
 * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
 * ************************************************************************/
void kmp_handle_upper_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
      original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to outer upper bound (inclusive since the
  // loop is canonical); note that less_than inner loops (inner_ub0 = -1)
  // effectively make the progression 1-based, making N = (outer_ub0 - inner_lb0
  // + 1) -> N - 1
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: total number of iterations
  // divided by the number of threads plus, if there's a remainder,
  // the first threads with the number up to the remainder get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with the tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them, so the cumulative number of
  // iterations executed by all the previous ones will be the current thread's
  // number of iterations multiplied by the number of previous threads which is
  // equal to the current thread's tid; threads with the number equal or above
  // the remainder will have (iter_total/nth) elements, so the cumulative number
  // of iterations previously executed is its number of iterations multiplied by
  // the number of previous threads which is again equal to the current thread's
  // tid PLUS all the remainder iterations that will have been executed by the
  // previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo) which is the max outer iv value
  // that gives the number of iterations that is equal or just below the total
  // number of iterations executed by the previous threads:
  // lbo*(lbo+1)/2<=iter_before_current =>
  // lbo^2+lbo-2*iter_before_current<=0
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // lower bound, except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration of
  // the current thread, not the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration space
  // is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;
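  // The bounds above are computed as if for a lower-triangular space counted
  // from the last outer row (which has a single iteration) and then mirrored
  // back with (outer_iters - 1) - x, so consecutive threads still cover the
  // upper triangle without gaps or overlap.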
#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current,
         iter_total);
#endif
}
//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated). Internally API will expand the space
// to parallelogram/parallelepiped, calculate total, calculate bounds for the
// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
// important on the left side, to hit the lower bounds and not step over), and
// pick the correct chunk for this thread (so it will calculate chunks up to the
// needed one). It could be optimized to calculate just this chunk, potentially
// a bit less well distributed among threads. It is designed to make sure that
// threads will receive predictable chunks, deterministically (so that next nest
// of loops with similar characteristics will get exactly same chunks on same
// number of threads).
//
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (2) without it current contract that chunk_bounds_nest has only lb0 and
  // ub0, lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // Copy all info from original_bounds_nest, it'll be good enough.
  //
  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }
  //
  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (;;) {
    // We could calculate chunk size once, but this is to compensate that the
    // original space is not parallelepiped and some threads can be left
    // without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      curr_chunk_size += 1;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was last chunk, but no chunk for current thread was
      // found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      last_iter = true;
    }

    // original_ivs_start is next to previous chunk original_ivs_end,
    // we need to start new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }
}