Index: include/polly/CodeGen/IslAst.h =================================================================== --- include/polly/CodeGen/IslAst.h +++ include/polly/CodeGen/IslAst.h @@ -52,7 +52,7 @@ IslAstUserPayload() : IsInnermost(false), IsInnermostParallel(false), IsOutermostParallel(false), IsReductionParallel(false), - Build(nullptr) {} + DependenceDistance(nullptr), Build(nullptr) {} /// @brief Cleanup all isl structs on destruction. ~IslAstUserPayload(); @@ -69,6 +69,9 @@ /// @brief Flag to mark parallel loops which break reductions. bool IsReductionParallel; + /// @brief The dependence distance for this loop if constant and not zero. + struct isl_pw_aff *DependenceDistance; + /// @brief The build environment at the time this node was constructed. isl_ast_build *Build; @@ -126,6 +129,10 @@ /// @brief Get the nodes schedule or a nullptr if not available. static __isl_give isl_union_map *getSchedule(__isl_keep isl_ast_node *Node); + /// @brief Get the nodes dependence distance or a nullptr if not available. + static __isl_give isl_pw_aff * + getDependenceDistance(__isl_keep isl_ast_node *Node); + /// @brief Get the nodes broken reductions or a nullptr if not available. static MemoryAccessSet *getBrokenReductions(__isl_keep isl_ast_node *Node); Index: include/polly/Dependences.h =================================================================== --- include/polly/Dependences.h +++ include/polly/Dependences.h @@ -28,6 +28,7 @@ #include #include "isl/ctx.h" +struct isl_pw_aff; struct isl_union_map; struct isl_union_set; struct isl_map; @@ -88,13 +89,16 @@ /// @brief Check if a partial schedule is parallel wrt to @p Deps. /// - /// @param Schedule The subset of the scattering space that we want to check. - /// @param Deps The dependences @p Schedule needs to respect. + /// @param Schedule The subset of the scattering space that we want to + /// check. + /// @param Deps The dependences @p Schedule needs to respect. + /// @param DistancePtr If given the dependences distance will be return here. /// /// @return Returns true, if executing parallel the outermost dimension of /// @p Schedule is valid according to the dependences @p Deps. bool isParallel(__isl_keep isl_union_map *Schedule, - __isl_take isl_union_map *Deps); + __isl_take isl_union_map *Deps, + __isl_give isl_pw_aff **DistancePtr = nullptr); /// @brief Get the dependences in this Scop. /// Index: lib/Analysis/Dependences.cpp =================================================================== --- lib/Analysis/Dependences.cpp +++ lib/Analysis/Dependences.cpp @@ -486,9 +486,12 @@ // dimension, then the loop is parallel. The distance is zero in the current // dimension if it is a subset of a map with equal values for the current // dimension. -bool Dependences::isParallel(isl_union_map *Schedule, isl_union_map *Deps) { - isl_map *ScheduleDeps, *Test; - unsigned Dimension, IsParallel; +bool Dependences::isParallel(isl_union_map *Schedule, isl_union_map *Deps, + isl_pw_aff **DistancePtr) { + isl_set *Deltas, *Distance; + isl_map *ScheduleDeps; + unsigned Dimension; + bool IsParallel; Deps = isl_union_map_apply_range(Deps, isl_union_map_copy(Schedule)); Deps = isl_union_map_apply_domain(Deps, isl_union_map_copy(Schedule)); @@ -504,14 +507,28 @@ for (unsigned i = 0; i < Dimension; i++) ScheduleDeps = isl_map_equate(ScheduleDeps, isl_dim_out, i, isl_dim_in, i); - Test = isl_map_universe(isl_map_get_space(ScheduleDeps)); - Test = isl_map_equate(Test, isl_dim_out, Dimension, isl_dim_in, Dimension); - IsParallel = isl_map_is_subset(ScheduleDeps, Test); + Deltas = isl_map_deltas(ScheduleDeps); + Distance = isl_set_universe(isl_set_get_space(Deltas)); - isl_map_free(Test); - isl_map_free(ScheduleDeps); + // [0, ..., 0, +] - All zeros and last dimension larger than zero + for (unsigned i = 0; i < Dimension; i++) + Distance = isl_set_fix_si(Distance, isl_dim_set, i, 0); + + Distance = isl_set_lower_bound_si(Distance, isl_dim_set, Dimension, 1); + Distance = isl_set_intersect(Distance, Deltas); + + IsParallel = isl_set_is_empty(Distance); + if (IsParallel || !DistancePtr) { + isl_set_free(Distance); + return IsParallel; + } - return IsParallel; + Distance = isl_set_project_out(Distance, isl_dim_set, 0, Dimension); + Distance = isl_set_coalesce(Distance); + + *DistancePtr = isl_pw_aff_coalesce(isl_set_dim_min(Distance, 0)); + + return false; } static void printDependencyMap(raw_ostream &OS, __isl_keep isl_union_map *DM) { Index: lib/CodeGen/IslAst.cpp =================================================================== --- lib/CodeGen/IslAst.cpp +++ lib/CodeGen/IslAst.cpp @@ -82,6 +82,7 @@ IslAstInfo::IslAstUserPayload::~IslAstUserPayload() { isl_ast_build_free(Build); + isl_pw_aff_free(DependenceDistance); } /// @brief Temporary information used when building the ast. @@ -102,9 +103,12 @@ /// @brief Print a string @p str in a single line using @p Printer. static isl_printer *printLine(__isl_take isl_printer *Printer, - const std::string &str) { + const std::string &str, + __isl_keep isl_pw_aff *PWA = nullptr) { Printer = isl_printer_start_line(Printer); Printer = isl_printer_print_str(Printer, str.c_str()); + if (PWA) + Printer = isl_printer_print_pw_aff(Printer, PWA); return isl_printer_end_line(Printer); } @@ -141,16 +145,22 @@ __isl_take isl_ast_print_options *Options, __isl_keep isl_ast_node *Node, void *) { + isl_pw_aff *DD = IslAstInfo::getDependenceDistance(Node); const std::string BrokenReductionsStr = getBrokenReductionsStr(Node); + const std::string DepDisPragmaStr = "#pragma minimal dependence distance: "; const std::string SimdPragmaStr = "#pragma simd"; const std::string OmpPragmaStr = "#pragma omp parallel for"; + if (DD) + Printer = printLine(Printer, DepDisPragmaStr, DD); + if (IslAstInfo::isInnermostParallel(Node)) Printer = printLine(Printer, SimdPragmaStr + BrokenReductionsStr); if (IslAstInfo::isOutermostParallel(Node)) Printer = printLine(Printer, OmpPragmaStr + BrokenReductionsStr); + isl_pw_aff_free(DD); return isl_ast_node_for_print(Node, Printer, Options); } @@ -173,7 +183,9 @@ isl_union_map *Schedule = isl_ast_build_get_schedule(Build); isl_union_map *Deps = D->getDependences( Dependences::TYPE_RAW | Dependences::TYPE_WAW | Dependences::TYPE_WAR); - if (!D->isParallel(Schedule, Deps) && !isl_union_map_free(Schedule)) + + if (!D->isParallel(Schedule, Deps, &NodeInfo->DependenceDistance) && + !isl_union_map_free(Schedule)) return false; isl_union_map *RedDeps = D->getDependences(Dependences::TYPE_TC_RED); @@ -408,6 +420,11 @@ return Payload ? isl_ast_build_get_schedule(Payload->Build) : nullptr; } +isl_pw_aff *IslAstInfo::getDependenceDistance(__isl_keep isl_ast_node *Node) { + IslAstUserPayload *Payload = getNodePayload(Node); + return Payload ? isl_pw_aff_copy(Payload->DependenceDistance) : nullptr; +} + IslAstInfo::MemoryAccessSet * IslAstInfo::getBrokenReductions(__isl_keep isl_ast_node *Node) { IslAstUserPayload *Payload = getNodePayload(Node); Index: test/Isl/Ast/dependence_distance_constant.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_constant.ll @@ -0,0 +1,52 @@ +; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *A, int N) { +; CHECK: #pragma minimal dependence distance: 1 +; for (int j = 0; j < N; j++) +; CHECK: #pragma minimal dependence distance: 8 +; for (int i = 0; i < N; i++) +; A[i + 8] = A[i] + 1; +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* %A, i32 %N) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %j.0 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %cmp = icmp slt i32 %j.0, %N + br i1 %cmp, label %for.body, label %for.end8 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %i.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %arrayidx = getelementptr inbounds i32* %A, i32 %i.0 + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %add4 = add nsw i32 %i.0, 8 + %arrayidx5 = getelementptr inbounds i32* %A, i32 %add4 + store i32 %add, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc6 + +for.inc6: ; preds = %for.end + %inc7 = add nsw i32 %j.0, 1 + br label %for.cond + +for.end8: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_multiple_constant.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_multiple_constant.ll @@ -0,0 +1,43 @@ +; RUN: opt %loadPolly -basicaa -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *restrict A, int *restrict B, int N) { +; CHECK: #pragma minimal dependence distance: 5 +; for (int i = 0; i < N; i++) { +; A[i + 7] = A[i] + 1; +; B[i + 5] = B[i] + 1; +; } +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* noalias %A, i32* noalias %B, i32 %N) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i32* %A, i32 %i.0 + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %add1 = add nsw i32 %i.0, 7 + %arrayidx2 = getelementptr inbounds i32* %A, i32 %add1 + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32* %B, i32 %i.0 + %tmp1 = load i32* %arrayidx3, align 4 + %add4 = add nsw i32 %tmp1, 1 + %add5 = add nsw i32 %i.0, 5 + %arrayidx6 = getelementptr inbounds i32* %B, i32 %add5 + store i32 %add4, i32* %arrayidx6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_parametric.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_parametric.ll @@ -0,0 +1,52 @@ +; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *A, int N, int c) { +; CHECK: #pragma minimal dependence distance: 1 +; for (int j = 0; j < N; j++) +; CHECK: #pragma minimal dependence distance: (-1 + c >= 0) ? (c) : -c +; for (int i = 0; i < N; i++) +; A[i + c] = A[i] + 1; +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* %A, i32 %N, i32 %c) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %j.0 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %cmp = icmp slt i32 %j.0, %N + br i1 %cmp, label %for.body, label %for.end8 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %i.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %arrayidx = getelementptr inbounds i32* %A, i32 %i.0 + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %add4 = add nsw i32 %i.0, %c + %arrayidx5 = getelementptr inbounds i32* %A, i32 %add4 + store i32 %add, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc6 + +for.inc6: ; preds = %for.end + %inc7 = add nsw i32 %j.0, 1 + br label %for.cond + +for.end8: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_parametric_expr.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_parametric_expr.ll @@ -0,0 +1,53 @@ +; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *A, int N, int c, int v) { +; CHECK: #pragma minimal dependence distance: 1 +; for (int j = 0; j < N; j++) +; CHECK: #pragma minimal dependence distance: (-1 + c + v >= 0) ? (c + v) : -c - v +; for (int i = 0; i < N; i++) +; A[i + c + v] = A[i] + 1; +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* %A, i32 %N, i32 %c, i32 %v) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc7, %entry + %j.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc7 ] + %cmp = icmp slt i32 %j.0, %N + br i1 %cmp, label %for.body, label %for.end9 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %i.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %arrayidx = getelementptr inbounds i32* %A, i32 %i.0 + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %add4 = add nsw i32 %i.0, %c + %add5 = add nsw i32 %add4, %v + %arrayidx6 = getelementptr inbounds i32* %A, i32 %add5 + store i32 %add, i32* %arrayidx6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc7 + +for.inc7: ; preds = %for.end + %inc8 = add nsw i32 %j.0, 1 + br label %for.cond + +for.end9: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_varying.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_varying.ll @@ -0,0 +1,35 @@ +; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *A, int N) { +; CHECK: #pragma minimal dependence distance: ((N - 1) % 2) + 1 +; for (int i = 0; i < N; i++) +; A[i] = A[N - i] + 1; +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* %A, i32 %N) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %sub = sub nsw i32 %N, %i.0 + %arrayidx = getelementptr inbounds i32* %A, i32 %sub + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i32 %i.0 + store i32 %add, i32* %arrayidx1, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_varying_in_outer_loop.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_varying_in_outer_loop.ll @@ -0,0 +1,53 @@ +; RUN: opt %loadPolly -polly-canonicalize -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *restrict A, int *restrict sum) { +; CHECK: #pragma minimal dependence distance: 1 +; for (int j = 0; j < 1024; j++) +; CHECK: #pragma minimal dependence distance: 1 +; for (int i = j; i < 1024; i++) +; A[i - 3] = A[j] * 2 + A[j] + 2; +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* noalias %A, i32* noalias %sum) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc7, %entry + %j.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc7 ] + %exitcond1 = icmp ne i32 %j.0, 1024 + br i1 %exitcond1, label %for.body, label %for.end9 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i32 [ %j.0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %i.0, 1024 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %arrayidx = getelementptr inbounds i32* %A, i32 %j.0 + %tmp = load i32* %arrayidx, align 4 + %add = mul nsw i32 %tmp, 3 + %add5 = add nsw i32 %add, 2 + %sub = add nsw i32 %i.0, -3 + %arrayidx6 = getelementptr inbounds i32* %A, i32 %sub + store i32 %add5, i32* %arrayidx6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i32 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc7 + +for.inc7: ; preds = %for.end + %inc8 = add nsw i32 %j.0, 1 + br label %for.cond + +for.end9: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/dependence_distance_varying_multiple.ll =================================================================== --- /dev/null +++ test/Isl/Ast/dependence_distance_varying_multiple.ll @@ -0,0 +1,70 @@ +; RUN: opt %loadPolly -basicaa -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s +; +; void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D, +; int *restrict E, int N) { +; CHECK: #pragma minimal dependence distance: (-35 + N >= 0) ? (1) : (-17 + N >= 0 && 34 - N >= 0) ? (2) : 5 +; for (int i = 0; i < N; i++) { +; A[i] = A[100 - 2 * i] + 1; +; B[i] = B[100 - 3 * i] + 1; +; C[i] = C[100 - 4 * i] + 1; +; D[i] = D[100 - 5 * i] + 1; +; E[i] = E[100 - 6 * i] + 1; +; } +; } +; +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define void @f(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32 %N) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = shl nsw i32 %i.0, 1 + %sub = sub nsw i32 100, %mul + %arrayidx = getelementptr inbounds i32* %A, i32 %sub + %tmp = load i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i32 %i.0 + store i32 %add, i32* %arrayidx1, align 4 + %tmp1 = mul i32 %i.0, -3 + %sub3 = add i32 %tmp1, 100 + %arrayidx4 = getelementptr inbounds i32* %B, i32 %sub3 + %tmp2 = load i32* %arrayidx4, align 4 + %add5 = add nsw i32 %tmp2, 1 + %arrayidx6 = getelementptr inbounds i32* %B, i32 %i.0 + store i32 %add5, i32* %arrayidx6, align 4 + %mul7 = shl nsw i32 %i.0, 2 + %sub8 = sub nsw i32 100, %mul7 + %arrayidx9 = getelementptr inbounds i32* %C, i32 %sub8 + %tmp3 = load i32* %arrayidx9, align 4 + %add10 = add nsw i32 %tmp3, 1 + %arrayidx11 = getelementptr inbounds i32* %C, i32 %i.0 + store i32 %add10, i32* %arrayidx11, align 4 + %tmp4 = mul i32 %i.0, -5 + %sub13 = add i32 %tmp4, 100 + %arrayidx14 = getelementptr inbounds i32* %D, i32 %sub13 + %tmp5 = load i32* %arrayidx14, align 4 + %add15 = add nsw i32 %tmp5, 1 + %arrayidx16 = getelementptr inbounds i32* %D, i32 %i.0 + store i32 %add15, i32* %arrayidx16, align 4 + %tmp6 = mul i32 %i.0, -6 + %sub18 = add i32 %tmp6, 100 + %arrayidx19 = getelementptr inbounds i32* %E, i32 %sub18 + %tmp7 = load i32* %arrayidx19, align 4 + %add20 = add nsw i32 %tmp7, 1 + %arrayidx21 = getelementptr inbounds i32* %E, i32 %i.0 + store i32 %add20, i32* %arrayidx21, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/Ast/reduction_dependences_equal_non_reduction_dependences.ll =================================================================== --- test/Isl/Ast/reduction_dependences_equal_non_reduction_dependences.ll +++ test/Isl/Ast/reduction_dependences_equal_non_reduction_dependences.ll @@ -4,6 +4,7 @@ ; accesses to the array A. We need to ensure we do __not__ parallelize anything ; here. ; +; CHECK: #pragma minimal dependence distance: 1 ; CHECK-NOT: pragma ; CHECK-NOT: reduction ;