/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_stats.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)

// avoid inadvertently using a library based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}
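// For example: __kmp_sign(-5) == -1, __kmp_sign(0) == 0, __kmp_sign(7) == 1.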
template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
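// Usage sketch (illustrative): per-nest scratch arrays stay on the stack for
// shallow nests and fall back to __kmp_allocate for deep ones, e.g.
//   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
//   iterations[0] = 0; // stack-backed while n * sizeof(T) <= allocaSize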
//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depends on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
    // because ub0 + ub1*j should be still positive (otherwise loop was not
    // well formed)
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
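// Worked example (illustrative): "for (int i = 0; i != 10; i += 2)" arrives
// as lb0 = 0, ub0 = 10, step = 2, comparison = comp_not_eq. The step is
// positive, so != becomes <, and < then becomes <= with ub0 decremented:
// lb0 = 0, ub0 = 9, step = 2, comparison = comp_less_or_eq.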
// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.
template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
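// Worked example (illustrative): for the canonicalized loop above
// (lb0 = 0, ub0 = 9, step = 2, <=), trip_count = (9 - 0) / 2 + 1 = 5,
// matching the iterations i = 0, 2, 4, 6, 8.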
// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return kmp_uint64 value which can be easily used in all internal calculations
// and can be statically cast back to original type in user code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
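// Worked example (illustrative): for an 8-bit signed IV,
// kmp_fix_iv(loop_type_t::loop_type_int8, 0xFFull) truncates to
// kmp_int8(-1) and sign-extends back, returning 0xFFFFFFFFFFFFFFFF, which a
// static_cast<kmp_int8> in user code recovers as -1.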
//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}
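// Illustrative trace: for an inner loop "for (j = i; j <= 10; ++j)"
// (lb0 = 0, lb1 = 1, step = 1) with original_ivs[bounds->outer_iv] = 3 and
// iterations[ind] = 2, temp = 0 + 1 * 3 + 2 * 1 = 5; original_ivs[ind]
// becomes 5 and is then checked against the upper bound 10.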
bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind
// (rectangular case, so no bounds check is needed).
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
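// Illustrative: a rectangular nest of two loops with trip counts 5 and 3
// yields total = 5 * 3 = 15 points in the collapsed iteration space.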
//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;

    new_iv = temp;
    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
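// Worked example (illustrative): with per-level trip counts {5, 3} and
// new_iv = 7, the backward pass computes iterations[1] = 7 % 3 = 1 and
// iterations[0] = (7 / 3) % 5 = 2, a mixed-radix decomposition of the
// collapsed index into per-loop iteration numbers {2, 1}.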
//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so lower bound should be smallest
// value, upper bound should be the biggest value. If the loop won't execute,
// 'smallest' may be bigger than 'biggest', but we'd better not switch them
// around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of previous ones; can't be the outermost
    // one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to the multiple of step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of previous ones; can't be the outermost
    // one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to the multiple of step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
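// Illustrative: for "for (j = i; j <= i + 4; ++j)" under
// "for (i = 0; i <= 9; ++i)" (lb0 = 0, lb1 = 1, ub0 = 4, ub1 = 1), the
// candidates give span_smallest = 0 + 1 * 0 = 0 and
// span_biggest = 4 + 1 * 9 = 13: the extreme values j can ever take.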
//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped shape.
// We need to stay in the original range of values, so that there will be no
// overflow, for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
    // The idea here is that for this IV we are now getting the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
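// Worked example (illustrative): a triangular nest
//   for (i = 0; i <= 9; ++i)
//     for (j = 0; j <= i; ++j)
// has lb1 = 0, ub1 = 1 on the inner level. The signs differ, so the level is
// rectangularized: lb1 = ub1 = 0, and since old_ub1 (1) > ub1 (0) the upper
// bound moves by (1 - 0) * span_biggest(i) = 9, giving j in [0, 9]. The
// expanded space has 10 * 10 = 100 points, of which 55 are real; chunk ends
// computed in it are pulled back into the original space later.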
// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IV's on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust to fit into
// original loops bounds. Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
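// Worked example (illustrative): the inverse of the mixed-radix decomposition
// above: with trip counts {5, 3} and per-level iterations {2, 1},
// new_iv = (0 * 5 + 2) * 3 + 1 = 7.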
//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // No more iterations to process:
        return false;
      }
      // Go to next iteration on the outer loop:
      --ind;
      ++(iterations[ind]);
      lengthened_ind = ind;
      for (kmp_index_t i = ind + 1; i < n; ++i) {
        iterations[i] = 0;
      }
      continue;
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: lower bounds of all loops may not work -
// if on some of the iterations of the outer loops inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++(iterations[ind]);

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}
//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  //     big_type;
  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    //   that
    // - we have to go into original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    //   starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // half of the planned iterations from the original lower bound instead:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than its start.
        // Use a quarter of the planned iterations from the start instead:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}
// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting result need to make sure that the
// new chunk starts at next position to old chunk, not overlaps with it (this is
// done elsewhere), and need to make sure end of the chunk is further than the
// beginning of the chunk. We don't need an exact ending point here, just
// something more-or-less close to the desired chunk length, bigger is fine
// (smaller would be fine, but we risk going into infinite loop, so do smaller
// only at the very end of the space). result: false if could not find the
// ending point in the original loop space. In this case the caller can use
// original upper bounds as the end of the chunk. Chunk won't be empty, because
// it'll have at least the starting point, which is by construction in the
// original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}
//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated). Internally API will expand the space
// to parallelogram/parallelepiped, calculate total, calculate bounds for the
// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
// important on the left side, to hit the lower bounds and not step over), and
// pick the correct chunk for this thread (so it will calculate chunks up to the
// needed one). It could be optimized to calculate just this chunk, potentially
// a bit less well distributed among threads. It is designed to make sure that
// threads will receive predictable chunks, deterministically (so that next nest
// of loops with similar characteristics will get exactly same chunks on same
// threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
1294 extern "C" kmp_int32
1295 __kmpc_for_collapsed_init(ident_t
*loc
, kmp_int32 gtid
,
1296 /*in/out*/ bounds_info_t
*original_bounds_nest
,
1297 /*out*/ bounds_info_t
*chunk_bounds_nest
,
1298 kmp_index_t n
, /*out*/ kmp_int32
*plastiter
) {
1300 KMP_DEBUG_ASSERT(plastiter
&& original_bounds_nest
);
1301 KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid
));
1303 if (__kmp_env_consistency_check
) {
1304 __kmp_push_workshare(gtid
, ct_pdo
, loc
);
1307 kmp_canonicalize_loop_nest(loc
, /*in/out*/ original_bounds_nest
, n
);
1309 CollapseAllocator
<bounds_info_internal_t
> updated_bounds_nest(n
);
1311 for (kmp_index_t i
= 0; i
< n
; ++i
) {
1312 updated_bounds_nest
[i
].b
= original_bounds_nest
[i
];
1315 kmp_loop_nest_iv_t total
=
1316 kmp_process_loop_nest(/*in/out*/ updated_bounds_nest
, n
);
1318 if (plastiter
!= NULL
) {
1323 // Loop won't execute:
1327 // OMPTODO: DISTRIBUTE is not supported yet
1328 __kmp_assert_valid_gtid(gtid
);
1329 kmp_uint32 tid
= __kmp_tid_from_gtid(gtid
);
1331 kmp_info_t
*th
= __kmp_threads
[gtid
];
1332 kmp_team_t
*team
= th
->th
.th_team
;
1333 kmp_uint32 nth
= team
->t
.t_nproc
; // Number of threads
1335 KMP_DEBUG_ASSERT(tid
< nth
);
1337 CollapseAllocator
<kmp_uint64
> original_ivs_start(n
);
1339 if (!kmp_calc_original_ivs_for_start(original_bounds_nest
, n
,
1340 /*out*/ original_ivs_start
)) {
1341 // Loop won't execute:
1345 // Not doing this optimization for one thread:
1347 // (2) without it current contract that chunk_bounds_nest has only lb0 and
1348 // ub0, lb1 and ub1 are set to 0 and can be ignored.
1351 // // Copy all info from original_bounds_nest, it'll be good enough.
1353 // for (kmp_index_t i = 0; i < n; ++i) {
1354 // chunk_bounds_nest[i] = original_bounds_nest[i];
1357 // if (plastiter != NULL) {
1358 // *plastiter = TRUE;
1363 kmp_loop_nest_iv_t new_iv
= kmp_calc_new_iv_from_original_ivs(
1364 updated_bounds_nest
, original_ivs_start
, n
);
1366 bool last_iter
= false;
1369 // We could calculate chunk size once, but this is to compensate that the
1370 // original space is not parallelepiped and some threads can be left
1372 KMP_DEBUG_ASSERT(total
>= new_iv
);
1374 kmp_loop_nest_iv_t total_left
= total
- new_iv
;
1375 kmp_loop_nest_iv_t chunk_size
= total_left
/ nth
;
1376 kmp_loop_nest_iv_t remainder
= total_left
% nth
;
1378 kmp_loop_nest_iv_t curr_chunk_size
= chunk_size
;
1380 if (remainder
> 0) {
1385 #if defined(KMP_DEBUG)
1386 kmp_loop_nest_iv_t new_iv_for_start
= new_iv
;
1389 if (curr_chunk_size
> 1) {
1390 new_iv
+= curr_chunk_size
- 1;
1393 CollapseAllocator
<kmp_uint64
> original_ivs_end(n
);
1394 if ((nth
== 1) || (new_iv
>= total
- 1)) {
1395 // Do this one till the end - just in case we miscalculated
1396 // and either too much is left to process or new_iv is a bit too big:
1397 kmp_calc_original_ivs_for_end(original_bounds_nest
, n
,
1398 /*out*/ original_ivs_end
);
1402 // Note: here we make sure it's past (or equal to) the previous point.
1403 if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest
, n
,
1404 updated_bounds_nest
,
1405 original_ivs_start
, new_iv
,
1406 /*out*/ original_ivs_end
)) {
1407 // We could not find the ending point, use the original upper bounds:
1408 kmp_calc_original_ivs_for_end(original_bounds_nest
, n
,
1409 /*out*/ original_ivs_end
);
1415 #if defined(KMP_DEBUG)
1416 auto new_iv_for_end
= kmp_calc_new_iv_from_original_ivs(
1417 updated_bounds_nest
, original_ivs_end
, n
);
1418 KMP_DEBUG_ASSERT(new_iv_for_end
>= new_iv_for_start
);
1421 if (last_iter
&& (tid
!= 0)) {
1422 // We are done, this was last chunk, but no chunk for current thread was
1428 // We found the chunk for this thread, now we need to check if it's the
1429 // last chunk or not:
1431 CollapseAllocator
<kmp_uint64
> original_ivs_next_start(n
);
1433 !kmp_calc_next_original_ivs(original_bounds_nest
, n
, original_ivs_end
,
1434 /*out*/ original_ivs_next_start
)) {
1435 // no more loop iterations left to process,
1436 // this means that currently found chunk is the last chunk:
1437 if (plastiter
!= NULL
) {
1442 // Fill in chunk bounds:
1443 for (kmp_index_t i
= 0; i
< n
; ++i
) {
1444 chunk_bounds_nest
[i
] =
1445 original_bounds_nest
[i
]; // To fill in types, etc. - optional
1446 chunk_bounds_nest
[i
].lb0_u64
= original_ivs_start
[i
];
1447 chunk_bounds_nest
[i
].lb1_u64
= 0;
1449 chunk_bounds_nest
[i
].ub0_u64
= original_ivs_end
[i
];
1450 chunk_bounds_nest
[i
].ub1_u64
= 0;
1459 bool next_chunk
= kmp_calc_next_original_ivs(
1460 original_bounds_nest
, n
, original_ivs_end
, /*out*/ original_ivs_start
);
1462 // no more loop iterations to process,
1463 // the prevoius chunk was the last chunk
1467 // original_ivs_start is next to previous chunk original_ivs_end,
1468 // we need to start new chunk here, so chunks will be one after another
1469 // without any gap or overlap:
1470 new_iv
= kmp_calc_new_iv_from_original_ivs(updated_bounds_nest
,
1471 original_ivs_start
, n
);