Index: openmp/runtime/src/CMakeLists.txt
===================================================================
--- openmp/runtime/src/CMakeLists.txt
+++ openmp/runtime/src/CMakeLists.txt
@@ -88,6 +88,7 @@
   kmp_dispatch.cpp
   kmp_lock.cpp
   kmp_sched.cpp
+  kmp_collapse.cpp
 )
 if(WIN32)
   # Windows specific files
Index: openmp/runtime/src/dllexports
===================================================================
--- openmp/runtime/src/dllexports
+++ openmp/runtime/src/dllexports
@@ -401,6 +401,9 @@
     __kmpc_sections_init                289
     __kmpc_next_section                 290
     __kmpc_end_sections                 291
+    __kmpc_process_loop_nest_rectang    293
+    __kmpc_calc_original_ivs_rectang    295
+    __kmpc_for_collapsed_init           296
 %endif

 # User API entry points that have both lower- and upper- case versions for Fortran.
Index: openmp/runtime/src/kmp_collapse.h
===================================================================
--- /dev/null
+++ openmp/runtime/src/kmp_collapse.h
@@ -0,0 +1,244 @@
+/*
+ * kmp_collapse.h -- header for loop collapse feature
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_COLLAPSE_H
+#define KMP_COLLAPSE_H
+
+#include <type_traits>
+#include <vector>
+
+// Type of the index into the loop nest structures
+// (with values from 0 to less than n from collapse(n))
+typedef kmp_int32 kmp_index_t;
+
+// Type for combined loop nest space IV:
+typedef kmp_uint64 kmp_loop_nest_iv_t;
+
+// Loop has <, <=, etc. as a comparison:
+enum comparison_t : kmp_int32 {
+  comp_less_or_eq = 0,
+  comp_greater_or_eq = 1,
+  comp_not_eq = 2,
+  comp_less = 3,
+  comp_greater = 4
+};
+
+// Type of the loop IV.
+// Types of the bounds and step, after the usual promotions,
+// are a subset of these types (32- and 64-bit only):
+enum loop_type_t : kmp_int32 {
+  loop_type_uint8 = 0,
+  loop_type_int8 = 1,
+  loop_type_uint16 = 2,
+  loop_type_int16 = 3,
+  loop_type_uint32 = 4,
+  loop_type_int32 = 5,
+  loop_type_uint64 = 6,
+  loop_type_int64 = 7
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Describes the structure for rectangular nested loops.
+ */
+template <typename T> struct bounds_infoXX_template {
+
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+
+  loop_type_t loop_type; // The differentiator
+  loop_type_t loop_iv_type;
+  comparison_t comparison;
+  // outer_iv should be 0 (or any other value less than the number of
+  // dimensions) if the loop doesn't depend on it (lb1 and ub1 will be 0).
+  // This way we can do multiplication without a check.
+  kmp_index_t outer_iv;
+
+  // unions to keep the size constant:
+  union {
+    T lb0;
+    kmp_uint64 lb0_u64; // real type can be signed
+  };
+
+  union {
+    T lb1;
+    kmp_uint64 lb1_u64; // real type can be signed
+  };
+
+  union {
+    T ub0;
+    kmp_uint64 ub0_u64; // real type can be signed
+  };
+
+  union {
+    T ub1;
+    kmp_uint64 ub1_u64; // real type can be signed
+  };
+
+  union {
+    ST step; // signed even if bounds type is unsigned
+    kmp_int64 step_64; // signed
+  };
+
+  kmp_loop_nest_iv_t trip_count;
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Interface struct for rectangular nested loops.
+ * Same size as bounds_infoXX_template.
+ */ +struct bounds_info_t { + + loop_type_t loop_type; // The differentiator + loop_type_t loop_iv_type; + comparison_t comparison; + // outer_iv should be 0 (or any other less then number of dimentions) + // if loop doesn't depend on it (lb1 and ub1 will be 0). + // This way we can do multiplication without a check. + kmp_index_t outer_iv; + + kmp_uint64 lb0_u64; // real type can be signed + kmp_uint64 lb1_u64; // real type can be signed + kmp_uint64 ub0_u64; // real type can be signed + kmp_uint64 ub1_u64; // real type can be signed + kmp_int64 step_64; // signed + + // This is internal, but it's the only internal thing we need + // in rectangular case, so let's expose it here: + kmp_loop_nest_iv_t trip_count; +}; + +//------------------------------------------------------------------------- +// Additional types for internal representation: + +// A point in the loop space, in the original space. +// It's represented in kmp_uint64, but each dimention is calculated in +// that loop IV type. Also dimentions have to be converted to those types +// when used in generated code. +typedef std::vector kmp_point_t; + +// Number of loop iterations on each nesting level to achieve some point, +// in expanded space or in original space. +// OMPTODO: move from using iterations to using offsets (iterations multiplied +// by steps). For those we need to be careful with the types, as step can be +// negative, but it'll remove multiplications and divisions in several places. +typedef std::vector kmp_iterations_t; + +// Internal struct with additional info: +template struct bounds_info_internalXX_template { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + // OMPTODO: should span have type T or should it better be + // kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 than + // updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to + // use big_span_t, if it can be resolved at compile time. + typedef + typename std::conditional::value, kmp_int64, kmp_uint64> + big_span_t; + + // typedef typename big_span_t span_t; + typedef T span_t; + + bounds_infoXX_template b; // possibly adjusted bounds + + // Leaving this as a union in case we'll switch to span_t with different sizes + // (depending on T) + union { + // Smallest possible value of iv (may be smaller than actually possible) + span_t span_smallest; + kmp_uint64 span_smallest_u64; + }; + + // Leaving this as a union in case we'll switch to span_t with different sizes + // (depending on T) + union { + // Biggest possible value of iv (may be bigger than actually possible) + span_t span_biggest; + kmp_uint64 span_biggest_u64; + }; + + // Did we adjust loop bounds (not counting canonicalization)? + bool loop_bounds_adjusted; +}; + +// Internal struct with additional info: +struct bounds_info_internal_t { + + bounds_info_t b; // possibly adjusted bounds + + // Smallest possible value of iv (may be smaller than actually possible) + kmp_uint64 span_smallest_u64; + + // Biggest possible value of iv (may be bigger than actually possible) + kmp_uint64 span_biggest_u64; + + // Did we adjust loop bounds (not counting canonicalization)? + bool loop_bounds_adjusted; +}; + +//----------APIs for rectangular loop nests-------------------------------- + +// Canonicalize loop nest and calculate overall trip count. +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). 
If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). +// Returns trip count of overall space. +extern "C" kmp_loop_nest_iv_t +__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n); + +// Calculate old induction variables corresponding to overall new_iv. +// Note: original IV will be returned as if it had kmp_uint64 type, +// will have to be converted to original type in user code. +// Note: trip counts should be already calculated by +// __kmpc_process_loop_nest_rectang. +// OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline +// that into user code. +extern "C" void +__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, + const bounds_info_t *original_bounds_nest, + /*out*/ kmp_uint64 *original_ivs, + kmp_index_t n); + +//----------Init API for non-rectangular loops-------------------------------- + +// Init API for collapsed loops (static, no chunks defined). +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). Internally API will expand the space +// to parallelogram/parallelepiped, calculate total, calculate bounds for the +// chunks in terms of the new IV, re-calc them in terms of old IVs (especially +// important on the left side, to hit the lower bounds and not step over), and +// pick the correct chunk for this thread (so it will calculate chunks up to the +// needed one). It could be optimized to calculate just this chunk, potentially +// a bit less well distributed among threads. It is designed to make sure that +// threads will receive predictable chunks, deterministically (so that next nest +// of loops with similar characteristics will get exactly same chunks on same +// threads). +// Current contract: chunk_bounds_nest has only lb0 and ub0, +// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future). +extern "C" kmp_int32 +__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest, + kmp_index_t n, + /*out*/ kmp_int32 *plastiter); + +#endif // KMP_COLLAPSE_H Index: openmp/runtime/src/kmp_collapse.cpp =================================================================== --- /dev/null +++ openmp/runtime/src/kmp_collapse.cpp @@ -0,0 +1,1466 @@ +/* + * kmp_collapse.cpp -- loop collapse feature + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#include "kmp_collapse.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +// OMPTODO: different style of comments (see kmp_sched) +// OMPTODO: OMPT/OMPD + +//---------------------------------------------------------------------------- +// Common functions for working with rectangular and non-rectangular loops +//---------------------------------------------------------------------------- + +template int sign(T val) { return (T(0) < val) - (val < T(0)); } + +//----------Loop canonicalization--------------------------------------------- + +// For loop nest (any shape): +// convert != to < or >; +// switch from using < or > to <= or >=. +// "bounds" array has to be allocated per thread. +// All other internal functions will work only with canonicalized loops. +template +void kmp_canonicalize_one_loop_XX(ident_t *loc, + /*in/out*/ bounds_infoXX_template *bounds) { + + if (__kmp_env_consistency_check) { + if (bounds->step == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + } + + if (bounds->comparison == comparison_t::comp_not_eq) { + // We can convert this to < or >, depends on the sign of the step: + if (bounds->step > 0) { + bounds->comparison = comparison_t::comp_less; + } else { + bounds->comparison = comparison_t::comp_greater; + } + } + + if (bounds->comparison == comparison_t::comp_less) { + // Note: ub0 can be unsigned. Should be Ok to hit overflow here, + // because ub0 + ub1*j should be still positive (otherwise loop was not + // well formed) + bounds->ub0 -= 1; + bounds->comparison = comparison_t::comp_less_or_eq; + } else if (bounds->comparison == comparison_t::comp_greater) { + bounds->ub0 += 1; + bounds->comparison = comparison_t::comp_greater_or_eq; + } +} + +// Canonicalize loop nest. original_bounds_nest is an array of length n. +void kmp_canonicalize_loop_nest(ident_t *loc, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n) { + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint32: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_int64: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint64: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + default: + KMP_ASSERT(false); + } + } +} + +//----------Calculating trip count on one level------------------------------- + +// Calculate trip count on this loop level. +// We do this either for a rectangular loop nest, +// or after an adjustment bringing the loops to a parallelepiped shape. +// This number should not depend on the value of outer IV +// even if the formular has lb1 and ub1. +// Note: for non-rectangular loops don't use span for this, it's too big. 
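+//
+// Worked example (illustration only): after canonicalization the loop
+// "for (int i = 0; i < 10; i += 2)" has lb0 = 0, ub0 = 9, step = 2, so
+// trip_count = (ub0 - lb0) / |step| + 1 = 9 / 2 + 1 = 5, matching the
+// iterations i = 0, 2, 4, 6, 8.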
+ +template +kmp_loop_nest_iv_t kmp_calculate_trip_count_XX( + /*in/out*/ bounds_infoXX_template *bounds) { + + if (bounds->comparison == comparison_t::comp_less_or_eq) { + if (bounds->ub0 < bounds->lb0) { + // Note: after this we don't need to calculate inner loops, + // but that should be an edge case: + bounds->trip_count = 0; + } else { + // ub - lb may exceed signed type range; we need to cast to + // kmp_loop_nest_iv_t anyway + bounds->trip_count = + static_cast(bounds->ub0 - bounds->lb0) / + std::abs(bounds->step) + + 1; + } + } else if (bounds->comparison == comparison_t::comp_greater_or_eq) { + if (bounds->lb0 < bounds->ub0) { + // Note: after this we don't need to calculate inner loops, + // but that should be an edge case: + bounds->trip_count = 0; + } else { + // lb - ub may exceed signed type range; we need to cast to + // kmp_loop_nest_iv_t anyway + bounds->trip_count = + static_cast(bounds->lb0 - bounds->ub0) / + std::abs(bounds->step) + + 1; + } + } else { + KMP_ASSERT(false); + } + return bounds->trip_count; +} + +// Calculate trip count on this loop level. +kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) { + + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + default: + KMP_ASSERT(false); + } + + return trip_count; +} + +//----------Trim original iv according to its type---------------------------- + +// Trim original iv according to its type. +// Return kmp_uint64 value which can be easily used in all internal calculations +// And can be statically cast back to original type in user code. 
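+//
+// For example (illustration only): with loop_iv_type == loop_type_int8 an
+// intermediate value of 0x1FF is truncated to the 8-bit value -1 and then
+// sign-extended, so the returned kmp_uint64 is 0xFFFFFFFFFFFFFFFF; casting
+// it back to the 8-bit IV type in user code recovers -1.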
+kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) { + kmp_uint64 res = 0; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint8: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int16: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint16: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int32: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint32: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int64: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint64: + res = static_cast(original_iv); + break; + default: + KMP_ASSERT(false); + } + + return res; +} + +//----------Compare two IVs (remember they have a type)----------------------- + +bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1, + kmp_uint64 original_iv2) { + bool res = false; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint8: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int16: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint16: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int32: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint32: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int64: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint64: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + default: + KMP_ASSERT(false); + } + + return res; +} + +//----------Calculate original iv on one level-------------------------------- + +// Return true if the point fits into upper bounds on this level, +// false otherwise +template +bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template *bounds, + const kmp_point_t &original_ivs, + kmp_index_t ind) { + + T iv = static_cast(original_ivs[ind]); + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (iv > + (bounds->ub0 + + bounds->ub1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (iv < + (bounds->ub0 + bounds->ub1 * outer_iv)))) { + // The calculated point is outside of loop upper boundary: + return false; + } + + return true; +} + +// Calculate one iv corresponding to iteration on the level ind. 
+// Return true if it fits into lower-upper bounds on this level +// (if not, we need to re-calculate) +template +bool kmp_calc_one_iv_XX(const bounds_infoXX_template *bounds, + /*in/out*/ kmp_point_t &original_ivs, + const kmp_iterations_t &iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + kmp_uint64 temp = 0; + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, should start + // with the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + auto iteration = iterations[ind]; + temp = bounds->lb0 + bounds->lb1 * outer_iv + + iteration * bounds->step; + } + + // Now trim original iv according to its type: + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); + + if (checkBounds) { + return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind); + } else { + return true; + } +} + +bool kmp_calc_one_iv(const bounds_info_t *bounds, + /*in/out*/ kmp_point_t &original_ivs, + const kmp_iterations_t &iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + default: + KMP_ASSERT(false); + return false; + } +} + +//----------Calculate original iv on one level for rectangular loop nest------ +// Main difference with the common case: original_ivs is an array supplied by +// user + +// Calculate one iv corresponding to iteration on the level ind. 
+// Return true if it fits into lower-upper bounds on this level +// (if not, we need to re-calculate) +template +void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template *bounds, + /*in/out*/ kmp_uint64 *original_ivs, + const kmp_iterations_t &iterations, + kmp_index_t ind) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + auto iteration = iterations[ind]; + + kmp_uint64 temp = + bounds->lb0 + + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv]) + + iteration * bounds->step; + + // Now trim original iv according to its type: + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); +} + +void kmp_calc_one_iv_rectang(const bounds_info_t *bounds, + /*in/out*/ kmp_uint64 *original_ivs, + const kmp_iterations_t &iterations, + kmp_index_t ind) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_uint32: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_int64: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_uint64: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + default: + KMP_ASSERT(false); + } +} + +//---------------------------------------------------------------------------- +// Rectangular loop nest +//---------------------------------------------------------------------------- + +//----------Canonicalize loop nest and calculate trip count------------------- + +// Canonicalize loop nest and calculate overall trip count. +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). +// Returns trip count of overall space. +extern "C" kmp_loop_nest_iv_t +__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n) { + + kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); + + kmp_loop_nest_iv_t total = 1; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds); + total *= trip_count; + } + + return total; +} + +//----------Calculate old induction variables--------------------------------- + +// Calculate old induction variables corresponding to overall new_iv. +// Note: original IV will be returned as if it had kmp_uint64 type, +// will have to be converted to original type in user code. +// Note: trip counts should be already calculated by +// __kmpc_process_loop_nest_rectang. 
+// OMPTODO: special case 2, 3 nested loops: either do different +// interface without array or possibly template this over n +extern "C" void +__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, + const bounds_info_t *original_bounds_nest, + /*out*/ kmp_uint64 *original_ivs, + kmp_index_t n) { + + kmp_iterations_t iterations(n); + + // First, calc corresponding iteration in every original loop: + for (kmp_index_t ind = n; ind > 0;) { + --ind; + auto bounds = &(original_bounds_nest[ind]); + + // should be optimized to OPDIVREM: + auto temp = new_iv / bounds->trip_count; + auto iteration = new_iv % bounds->trip_count; + new_iv = temp; + + iterations[ind] = iteration; + } + KMP_ASSERT(new_iv == 0); + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind); + } +} + +//---------------------------------------------------------------------------- +// Non-rectangular loop nest +//---------------------------------------------------------------------------- + +//----------Calculate maximum possible span of iv values on one level--------- + +// Calculate span for IV on this loop level for "<=" case. +// Note: it's for <= on this loop nest level, so lower bound should be smallest +// value, upper bound should be the biggest value. If the loop won't execute, +// 'smallest' may be bigger than 'biggest', but we'd better not switch them +// around. +template +void kmp_calc_span_lessoreq_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ std::vector &bounds_nest) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + // typedef typename big_span_t span_t; + typedef T span_t; + + auto &bbounds = bounds->b; + + if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { + // This dimention depends on one of previous ones; can't be the outermost + // one. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &(bounds_nest[bbounds.outer_iv])); + + // OMPTODO: assert that T is compatible with loop variable type on + // 'previous' loop + + { + span_t bound_candidate1 = + bbounds.lb0 + bbounds.lb1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.lb0 + bbounds.lb1 * previous->span_biggest; + if (bound_candidate1 < bound_candidate2) { + bounds->span_smallest = bound_candidate1; + } else { + bounds->span_smallest = bound_candidate2; + } + } + + { + // We can't adjust the upper bound with respect to step, because + // lower bound might be off after adjustments + + span_t bound_candidate1 = + bbounds.ub0 + bbounds.ub1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.ub0 + bbounds.ub1 * previous->span_biggest; + if (bound_candidate1 < bound_candidate2) { + bounds->span_biggest = bound_candidate2; + } else { + bounds->span_biggest = bound_candidate1; + } + } + } else { + // Rectangular: + bounds->span_smallest = bbounds.lb0; + bounds->span_biggest = bbounds.ub0; + } + if (!bounds->loop_bounds_adjusted) { + // Here it's safe to reduce the space to the multiply of step. + // OMPTODO: check if the formular is correct. + // Also check if it would be safe to do this if we didn't adjust left side. + bounds->span_biggest -= + (static_cast(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? + } +} + +// Calculate span for IV on this loop level for ">=" case. 
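+// Mirrors the "<=" case above with the roles of the bounds swapped: for a
+// rectangular level such as "for (int i = 9; i >= 0; i -= 2)" this gives
+// span_smallest = ub0 = 0 and span_biggest = lb0 = 9 (before the
+// step-granularity adjustment at the end of the function).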
+template +void kmp_calc_span_greateroreq_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ std::vector &bounds_nest) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + // typedef typename big_span_t span_t; + typedef T span_t; + + auto &bbounds = bounds->b; + + if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { + // This dimention depends on one of previous ones; can't be the outermost + // one. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &(bounds_nest[bbounds.outer_iv])); + + // OMPTODO: assert that T is compatible with loop variable type on + // 'previous' loop + + { + span_t bound_candidate1 = + bbounds.lb0 + bbounds.lb1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.lb0 + bbounds.lb1 * previous->span_biggest; + if (bound_candidate1 >= bound_candidate2) { + bounds->span_smallest = bound_candidate1; + } else { + bounds->span_smallest = bound_candidate2; + } + } + + { + // We can't adjust the upper bound with respect to step, because + // lower bound might be off after adjustments + + span_t bound_candidate1 = + bbounds.ub0 + bbounds.ub1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.ub0 + bbounds.ub1 * previous->span_biggest; + if (bound_candidate1 >= bound_candidate2) { + bounds->span_biggest = bound_candidate2; + } else { + bounds->span_biggest = bound_candidate1; + } + } + + } else { + // Rectangular: + bounds->span_biggest = bbounds.lb0; + bounds->span_smallest = bbounds.ub0; + } + if (!bounds->loop_bounds_adjusted) { + // Here it's safe to reduce the space to the multiply of step. + // OMPTODO: check if the formular is correct. + // Also check if it would be safe to do this if we didn't adjust left side. + bounds->span_biggest -= + (static_cast(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? + } +} + +// Calculate maximum possible span for IV on this loop level. +template +void kmp_calc_span_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ std::vector &bounds_nest) { + + if (bounds->b.comparison == comparison_t::comp_less_or_eq) { + kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + } else { + KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq); + kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + } +} + +//----------All initial processing of the loop nest--------------------------- + +// Calculate new bounds for this loop level. +// To be able to work with the nest we need to get it to a parallelepiped shape. +// We need to stay in the original range of values, so that there will be no +// overflow, for that we'll adjust both upper and lower bounds as needed. +template +void kmp_calc_new_bounds_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ std::vector &bounds_nest) { + + auto &bbounds = bounds->b; + + if (bbounds.lb1 == bbounds.ub1) { + // Already parallel, no need to adjust: + bounds->loop_bounds_adjusted = false; + } else { + bounds->loop_bounds_adjusted = true; + + T old_lb1 = bbounds.lb1; + T old_ub1 = bbounds.ub1; + + if (sign(old_lb1) != sign(old_ub1)) { + // With this shape we can adjust to a rectangle: + bbounds.lb1 = 0; + bbounds.ub1 = 0; + } else { + // get upper and lower bounds to be parallel + // with values in the old range. + // Note: std::abs didn't work here. 
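+      // In effect the branch below picks the coefficient with the smaller
+      // absolute value as the common slope, e.g. old_lb1 = 1, old_ub1 = 3
+      // becomes lb1 = ub1 = 1; lb0/ub0 are then shifted below so that the
+      // adjusted nest still covers the whole original space.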
+ if (((sign(old_lb1) == -1) && (old_lb1 < old_ub1)) || + ((sign(old_lb1) == 1) && (old_lb1 > old_ub1))) { + bbounds.lb1 = old_ub1; + } else { + bbounds.ub1 = old_lb1; + } + } + + // Now need to adjust lb0, ub0, otherwise in some cases space will shrink. + // The idea here that for this IV we are now getting the same span + // irrespective of the previous IV value. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &bounds_nest[bbounds.outer_iv]); + + if (bbounds.comparison == comparison_t::comp_less_or_eq) { + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move the lower bound: + + T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest; + bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space? + // e.g. it was 0?? (same below) + } else if (old_lb1 > bbounds.lb1) { + // still need to move lower bound: + T add = (old_lb1 - bbounds.lb1) * previous->span_smallest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move upper bound: + + T add = (old_ub1 - bbounds.ub1) * previous->span_biggest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + // still need to move upper bound: + T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest; + bbounds.ub0 -= sub; + } + } else { + KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq); + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest; + bbounds.lb0 -= sub; + } else if (old_lb1 > bbounds.lb1) { + T add = (old_lb1 - bbounds.lb1) * previous->span_biggest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + T add = (old_ub1 - bbounds.ub1) * previous->span_smallest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest; + bbounds.ub0 -= sub; + } + } + } +} + +// Do all processing for one canonicalized loop in the nest +// (assuming that outer loops already were processed): +template +kmp_loop_nest_iv_t kmp_process_one_loop_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /*in/out*/ std::vector &bounds_nest) { + + kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b)); +} + +// Non-rectangular loop nest, canonicalized to use <= or >=. +// Process loop nest to have a parallelepiped shape, +// calculate biggest spans for IV's on all levels and calculate overall trip +// count. "bounds_nest" has to be allocated per thread. +// Returns overall trip count (for adjusted space). 
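+//
+// For illustration: the triangular nest "for (i = 0; i < 4; ++i)
+// for (j = 0; j < i; ++j)" is expanded so that the inner level becomes
+// 0 <= j <= 2 independently of i; the returned total is then 4 * 3 = 12,
+// larger than the 6 iterations of the original space (the extra points are
+// discarded later when mapping back to the original IVs).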
+kmp_loop_nest_iv_t kmp_process_loop_nest( + /*in/out*/ std::vector &bounds_nest) { + + kmp_loop_nest_iv_t total = 1; + + for (kmp_index_t ind = 0; ind < bounds_nest.size(); ++ind) { + auto bounds = &(bounds_nest[ind]); + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->b.loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + default: + KMP_ASSERT(false); + } + total *= trip_count; + } + + return total; +} + +//----------Calculate iterations (in the original or updated space)----------- + +// Calculate number of iterations in original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +// (not counting initial iteration) +template +kmp_loop_nest_iv_t +kmp_calc_number_of_iterations_XX(const bounds_infoXX_template *bounds, + const kmp_point_t &original_ivs, + kmp_index_t ind) { + + kmp_loop_nest_iv_t iterations = 0; + + if (bounds->comparison == comparison_t::comp_less_or_eq) { + iterations = + (static_cast(original_ivs[ind]) - bounds->lb0 - + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv])) / + std::abs(bounds->step); + } else { + KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq); + iterations = (bounds->lb0 + + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv]) - + static_cast(original_ivs[ind])) / + std::abs(bounds->step); + } + + return iterations; +} + +// Calculate number of iterations in the original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +kmp_loop_nest_iv_t +kmp_calc_number_of_iterations(const bounds_info_t *bounds, + const kmp_point_t &original_ivs, + kmp_index_t ind) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + default: + KMP_ASSERT(false); + return 0; + } +} + +//----------Calculate new iv corresponding to original ivs-------------------- + +// We got a point in the original loop nest. +// Take updated bounds and calculate what new_iv will correspond to this point. +// When we are getting original IVs from new_iv, we have to adjust to fit into +// original loops bounds. Getting new_iv for the adjusted original IVs will help +// with making more chunks non-empty. 
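+//
+// The result is the mixed-radix position of the point in the updated space,
+// with the per-level trip counts as the radices; e.g. with trip counts
+// {4, 3}, a point 2 iterations into the outer loop and 1 iteration into the
+// inner loop maps to new_iv = 2 * 3 + 1 = 7.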
+kmp_loop_nest_iv_t kmp_calc_new_iv_from_original_ivs( + const std::vector &bounds_nest, + const kmp_point_t &original_ivs) { + + kmp_loop_nest_iv_t new_iv = 0; + + for (kmp_index_t ind = 0; ind < bounds_nest.size(); ++ind) { + auto bounds = &(bounds_nest[ind].b); + + new_iv = new_iv * bounds->trip_count + + kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + return new_iv; +} + +//----------Calculate original ivs for provided iterations-------------------- + +// Calculate original IVs for provided iterations, assuming iterations are +// calculated in the original space. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_from_iterations( + const bounds_info_t *original_bounds_nest, kmp_index_t n, + /*in/out*/ kmp_point_t &original_ivs, + /*in/out*/ kmp_iterations_t &iterations, kmp_index_t ind) { + + kmp_index_t lengthened_ind = n; + + for (; ind < n;) { + auto bounds = &(original_bounds_nest[ind]); + bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations, + ind, (lengthened_ind < ind), true); + + if (!good) { + // The calculated iv value is too big (or too small for >=): + if (ind == 0) { + // Space is empty: + return false; + } else { + // Go to next iteration on the outer loop: + --ind; + ++iterations[ind]; + lengthened_ind = ind; + for (kmp_index_t i = ind + 1; i < n; ++i) { + iterations[i] = 0; + } + continue; + } + } + ++ind; + } + + return true; +} + +//----------Calculate original ivs for the beginning of the loop nest--------- + +// Calculate IVs for the beginning of the loop nest. +// Note: lower bounds of all loops may not work - +// if on some of the iterations of the outer loops inner loops are empty. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, + kmp_index_t n, + /*out*/ kmp_point_t &original_ivs) { + + // Iterations in the original space, multiplied by step: + kmp_iterations_t iterations(n); + + for (kmp_index_t ind = n; ind > 0;) { + --ind; + iterations[ind] = 0; + } + + // Now calculate the point: + return kmp_calc_original_ivs_from_iterations(original_bounds_nest, n, + /*in/out*/ original_ivs, + /*in/out*/ iterations, 0); +} + +//----------Calculate next point in the original loop space------------------- + +// From current set of original IVs calculate next point. +// Return false if there is no next point in the loop bounds. +bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, + kmp_index_t n, const kmp_point_t &original_ivs, + /*out*/ kmp_point_t &next_original_ivs) { + // Iterations in the original space, multiplied by step (so can be negative): + kmp_iterations_t iterations(n); + + // First, calc corresponding iteration in every original loop: + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + next_original_ivs = original_ivs; + + // Next add one step to the iterations on the inner-most level, and see if we + // need to move up the nest: + kmp_index_t ind = n - 1; + ++iterations[ind]; + + return kmp_calc_original_ivs_from_iterations( + original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind); +} + +//----------Calculate chunk end in the original loop space-------------------- + +// For one level calculate old induction variable corresponding to overall +// new_iv for the chunk end. 
+// Return true if it fits into upper bound on this level +// (if not, we need to re-calculate) +template +bool kmp_calc_one_iv_for_chunk_end_XX( + const bounds_infoXX_template *bounds, + const bounds_infoXX_template *updated_bounds, + /*in/out*/ kmp_point_t &original_ivs, const kmp_iterations_t &iterations, + kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, + const kmp_point_t &original_ivs_start) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + typedef + typename std::conditional::value, kmp_int64, kmp_uint64> + big_span_t; + + // OMPTODO: is it good enough, or do we need ST or do we need big_span_t? + T temp = 0; + + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, may as well use + // the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + // Start in expanded space, but: + // - we need to hit original space lower bound, so need to account for + // that + // - we have to go into original space, even if that means adding more + // iterations than was planned + // - we have to go past (or equal to) previous point (which is the chunk + // starting point) + + auto iteration = iterations[ind]; + + auto step = bounds->step; + + // In case of >= it's negative: + auto accountForStep = + ((bounds->lb0 + bounds->lb1 * outer_iv) - + (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) % + step; + + temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv + + accountForStep + iteration * step; + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) { + // Too small (or too big), didn't reach the original lower bound. Use + // heuristic: + temp = bounds->lb0 + bounds->lb1 * outer_iv + + iteration / 2 * step; + } + + if (compare_with_start) { + + T start = static_cast(original_ivs_start[ind]); + + temp = kmp_fix_iv(bounds->loop_iv_type, temp); + + // On all previous levels start of the chunk is same as the end, need to + // be really careful here: + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp < start)) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp > start))) { + // End of the chunk can't be smaller (for >= bigger) than it's start. + // Use heuristic: + temp = start + iteration / 4 * step; + } + } + } + + original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp); + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) { + // Too big (or too small for >=). + return false; + } + + return true; +} + +// For one level calculate old induction variable corresponding to overall +// new_iv for the chunk end. 
+bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds, + const bounds_info_t *updated_bounds, + /*in/out*/ kmp_point_t &original_ivs, + const kmp_iterations_t &iterations, + kmp_index_t ind, bool start_with_lower_bound, + bool compare_with_start, + const kmp_point_t &original_ivs_start) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + default: + KMP_ASSERT(false); + return false; + } +} + +// Calculate old induction variables corresponding to overall new_iv for the +// chunk end. If due to space extension we are getting old IVs outside of the +// boundaries, bring them into the boundaries. Need to do this in the runtime, +// esp. on the lower bounds side. When getting result need to make sure that the +// new chunk starts at next position to old chunk, not overlaps with it (this is +// done elsewhere), and need to make sure end of the chunk is further than the +// beginning of the chunk. We don't need an exact ending point here, just +// something more-or-less close to the desired chunk length, bigger is fine +// (smaller would be fine, but we risk going into infinite loop, so do smaller +// only at the very end of the space). result: false if could not find the +// ending point in the original loop space. In this case the caller can use +// original upper bounds as the end of the chunk. Chunk won't be empty, because +// it'll have at least the starting point, which is by construction in the +// original space. 
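+//
+// In outline: new_iv is first decomposed into per-level iteration numbers of
+// the expanded space (mixed-radix, using the updated trip counts); the levels
+// are then walked from the outermost one, converting those iterations into
+// original IVs and backtracking to the next iteration of an outer level
+// whenever an inner level falls outside its original bounds.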
+bool kmp_calc_original_ivs_for_chunk_end( + const bounds_info_t *original_bounds_nest, kmp_index_t n, + const std::vector &updated_bounds_nest, + const kmp_point_t &original_ivs_start, kmp_loop_nest_iv_t new_iv, + /*out*/ kmp_point_t &original_ivs) { + + // Iterations in the expanded space: + kmp_iterations_t iterations(n); + + KMP_DEBUG_ASSERT(updated_bounds_nest.size() == n); + +#if defined(KMP_DEBUG) + auto new_iv_saved = new_iv; +#endif + + // First, calc corresponding iteration in every modified loop: + for (kmp_index_t ind = n; ind > 0;) { + --ind; + auto &updated_bounds = updated_bounds_nest[ind]; + + // should be optimized to OPDIVREM: + auto new_ind = new_iv / updated_bounds.b.trip_count; + auto iteration = new_iv % updated_bounds.b.trip_count; + + new_iv = new_ind; + iterations[ind] = iteration; + } + KMP_DEBUG_ASSERT(new_iv == 0); + + kmp_index_t lengthened_ind = n; + kmp_index_t equal_ind = -1; + + // Next calculate the point, but in original loop nest. + for (kmp_index_t ind = 0; ind < n;) { + auto bounds = &(original_bounds_nest[ind]); + auto updated_bounds = &(updated_bounds_nest[ind].b); + + bool good = kmp_calc_one_iv_for_chunk_end( + bounds, updated_bounds, + /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind), + (equal_ind >= ind - 1), original_ivs_start); + + if (!good) { + // Too big (or too small for >=). + if (ind == 0) { + // Need to reduce to the end. + return false; + } else { + // Go to next iteration on outer loop: + --ind; + ++(iterations[ind]); + lengthened_ind = ind; + if (equal_ind >= lengthened_ind) { + // We've changed the number of iterations here, + // can't be same anymore: + equal_ind = lengthened_ind - 1; + } + for (kmp_index_t i = ind + 1; i < n; ++i) { + iterations[i] = 0; + } + continue; + } + } + + if ((equal_ind == ind - 1) && + (kmp_ivs_eq(bounds->loop_iv_type, + original_ivs[ind], original_ivs_start[ind]))) { + equal_ind = ind; + } else if ((equal_ind > ind - 1) && + !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind], + original_ivs_start[ind]))) { + equal_ind = ind - 1; + } + ++ind; + } + + return true; +} + +//----------Calculate upper bounds for the last chunk------------------------- + +// Calculate one upper bound for the end. +template +void kmp_calc_one_iv_end_XX(const bounds_infoXX_template *bounds, + /*in/out*/ kmp_point_t &original_ivs, + kmp_index_t ind) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + + T temp = bounds->ub0 + + bounds->ub1 * static_cast(original_ivs[bounds->outer_iv]); + + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); +} + +void kmp_calc_one_iv_end(const bounds_info_t *bounds, + /*in/out*/ kmp_point_t &original_ivs, + kmp_index_t ind) { + + switch (bounds->loop_type) { + default: + KMP_ASSERT(false); + break; + case loop_type_t::loop_type_int32: + kmp_calc_one_iv_end_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_uint32: + kmp_calc_one_iv_end_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_int64: + kmp_calc_one_iv_end_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, ind); + break; + case loop_type_t::loop_type_uint64: + kmp_calc_one_iv_end_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, ind); + break; + } +} + +// Calculate upper bounds for the last loop iteration. Just use original upper +// bounds (adjusted when canonicalized to use <= / >=). 
No need to check that this +// point is in the original space (it's likely not) +void kmp_calc_original_ivs_for_end( + const bounds_info_t *const original_bounds_nest, kmp_index_t n, + /*out*/ kmp_point_t &original_ivs) { + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind); + } +} + +//----------Init API for non-rectangular loops-------------------------------- + +// Init API for collapsed loops (static, no chunks defined). +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). Internally API will expand the space +// to parallelogram/parallelepiped, calculate total, calculate bounds for the +// chunks in terms of the new IV, re-calc them in terms of old IVs (especially +// important on the left side, to hit the lower bounds and not step over), and +// pick the correct chunk for this thread (so it will calculate chunks up to the +// needed one). It could be optimized to calculate just this chunk, potentially +// a bit less well distributed among threads. It is designed to make sure that +// threads will receive predictable chunks, deterministically (so that next nest +// of loops with similar characteristics will get exactly same chunks on same +// threads). +// Current contract: chunk_bounds_nest has only lb0 and ub0, +// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future). +extern "C" kmp_int32 +__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest, + kmp_index_t n, /*out*/ kmp_int32 *plastiter) { + + KMP_DEBUG_ASSERT(plastiter && original_bounds_nest); + KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid)); + + if (__kmp_env_consistency_check) { + __kmp_push_workshare(gtid, ct_pdo, loc); + } + + kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); + + std::vector updated_bounds_nest(n); + + for (kmp_index_t i = 0; i < n; ++i) { + updated_bounds_nest[i].b = original_bounds_nest[i]; + } + + kmp_loop_nest_iv_t total = + kmp_process_loop_nest(/*in/out*/ updated_bounds_nest); + + if (plastiter != NULL) { + *plastiter = FALSE; + } + + if (total == 0) { + // Loop won't execute: + return FALSE; + } + + // OMPTODO: DISTRIBUTE is not supported yet + __kmp_assert_valid_gtid(gtid); + kmp_uint32 tid = __kmp_tid_from_gtid(gtid); + + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_uint32 nth = team->t.t_nproc; // Number of threads + + KMP_DEBUG_ASSERT(tid < nth); + + kmp_point_t original_ivs_start(n); + + if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n, + /*out*/ original_ivs_start)) { + // Loop won't execute: + return FALSE; + } + + // Not doing this optimization for one thread: + // (1) more to test + // (2) without it current contract that chunk_bounds_nest has only lb0 and ub0, + // lb1 and ub1 are set to 0 and can be ignored. + //if (nth == 1) { + // // One thread: + // // Copy all info from original_bounds_nest, it'll be good enough. 
+ + // for (kmp_index_t i = 0; i < n; ++i) { + // chunk_bounds_nest[i] = original_bounds_nest[i]; + // } + + // if (plastiter != NULL) { + // *plastiter = TRUE; + // } + // return TRUE; + //} + + kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs( + updated_bounds_nest, original_ivs_start); + + bool last_iter = false; + + for (; nth > 0;) { + // We could calculate chunk size once, but this is to compensate that the + // original space is not parallelepiped and some threads can be left + // without work: + KMP_DEBUG_ASSERT(total >= new_iv); + + kmp_loop_nest_iv_t total_left = total - new_iv; + kmp_loop_nest_iv_t chunk_size = total_left / nth; + kmp_loop_nest_iv_t remainder = total_left % nth; + + kmp_loop_nest_iv_t curr_chunk_size = chunk_size; + + if (remainder > 0) { + ++curr_chunk_size; + --remainder; + } + +#if defined(KMP_DEBUG) + kmp_loop_nest_iv_t new_iv_for_start = new_iv; +#endif + + if (curr_chunk_size > 1) { + new_iv += curr_chunk_size - 1; + } + + kmp_point_t original_ivs_end(n); + if ((nth == 1) || (new_iv >= total - 1)) { + // Do this one till the end - just in case we miscalculated + // and either too much is left to process or new_iv is a bit too big: + kmp_calc_original_ivs_for_end(original_bounds_nest, n, + /*out*/ original_ivs_end); + + last_iter = true; + } else { + // Note: here we make sure it's past (or equal to) the previous point. + if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n, + updated_bounds_nest, + original_ivs_start, new_iv, + /*out*/ original_ivs_end)) { + // We could not find the ending point, use the original upper bounds: + kmp_calc_original_ivs_for_end(original_bounds_nest, n, + /*out*/ original_ivs_end); + + last_iter = true; + } + } + +#if defined(KMP_DEBUG) + auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest, + original_ivs_end); + KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start); +#endif + + if (last_iter && (tid != 0)) { + // We are done, this was last chunk, but no chunk for current thread was + // found: + return FALSE; + } + + if (tid == 0) { + // We found the chunk for this thread, now we need to check if it's the + // last chunk or not: + + kmp_point_t original_ivs_next_start(n); + + if (last_iter || + !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end, + /*out*/ original_ivs_next_start)) { + // no more loop iterations left to process, + // this means that currently found chunk is the last chunk: + if (plastiter != NULL) { + *plastiter = TRUE; + } + } + + // Fill in chunk bounds: + for (kmp_index_t i = 0; i < n; ++i) { + chunk_bounds_nest[i] = + original_bounds_nest[i]; // To fill in types, etc. - optional + chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i]; + chunk_bounds_nest[i].lb1_u64 = 0; + + chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i]; + chunk_bounds_nest[i].ub1_u64 = 0; + } + + return TRUE; + } + + --tid; + --nth; + + bool next_chunk = kmp_calc_next_original_ivs( + original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start); + if (!next_chunk) { + // no more loop iterations to process, + // the prevoius chunk was the last chunk + break; + } + + // original_ivs_start is next to previous chunk original_ivs_end, + // we need to start new chunk here, so chunks will be one after another + // without any gap or overlap: + new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest, + original_ivs_start); + } + + return FALSE; +}
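
Illustration: the sketch below shows one plausible way compiler-generated code
could drive the two rectangular entry points for a collapse(2) nest. It is a
minimal sketch, not the actual clang codegen: loc, gtid and the chunk limits
lower/upper are assumed to come from the usual outlined-function prologue and
scheduling code, and the descriptor fields follow the bounds_info_t
declaration in kmp_collapse.h.

// Minimal usage sketch (assumes the kmp.h / kmp_collapse.h context) for:
//   for (int i = 0; i < 8; i += 2)
//     for (unsigned j = 0; j < 5; ++j)
static void collapsed_chunk_body(ident_t *loc, kmp_int32 gtid,
                                 kmp_loop_nest_iv_t lower,
                                 kmp_loop_nest_iv_t upper) {
  bounds_info_t nest[2];

  nest[0].loop_type = loop_type_int32;    // type of i's bounds and step
  nest[0].loop_iv_type = loop_type_int32; // type of i itself
  nest[0].comparison = comp_less;         // runtime canonicalizes to <=
  nest[0].outer_iv = 0;                   // rectangular: no outer dependence
  nest[0].lb0_u64 = 0;
  nest[0].lb1_u64 = 0;
  nest[0].ub0_u64 = 8;
  nest[0].ub1_u64 = 0;
  nest[0].step_64 = 2;

  nest[1].loop_type = loop_type_uint32;
  nest[1].loop_iv_type = loop_type_uint32;
  nest[1].comparison = comp_less;
  nest[1].outer_iv = 0;
  nest[1].lb0_u64 = 0;
  nest[1].lb1_u64 = 0;
  nest[1].ub0_u64 = 5;
  nest[1].ub1_u64 = 0;
  nest[1].step_64 = 1;

  // Canonicalize the nest and get the size of the collapsed space: 4 * 5 = 20.
  kmp_loop_nest_iv_t total =
      __kmpc_process_loop_nest_rectang(loc, gtid, nest, 2);
  (void)total; // a real caller would split [0, total) into per-thread chunks

  // Walk this thread's chunk of the collapsed space and recover i and j:
  for (kmp_loop_nest_iv_t iv = lower; iv <= upper; ++iv) {
    kmp_uint64 ivs[2];
    __kmpc_calc_original_ivs_rectang(loc, iv, nest, ivs, 2);
    int i = static_cast<kmp_int32>(ivs[0]);
    unsigned j = static_cast<kmp_uint32>(ivs[1]);
    // ... original loop body using i and j ...
  }
}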