Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -455,6 +455,38 @@ llvm::SmallVectorImpl &Loads); //@} + /// @brief Derive assumptions about parameter values from GetElementPtrInst + /// + /// In case a GEP instruction references into a fixed size array e.g., an + /// access A[i][j] into an array A[100x100], LLVM-IR does not guarantee that + /// the subscripts always compute values that are within array bounds. In this + /// function we derive the set of parameter values for which all accesses are + /// within bounds and add the assumption that the scop is only every executed + /// with this set of parameter values. + /// + /// Example: + /// + /// void foo(float A[10][20], long n, long m { + /// for (long i = 0; i < n; i++) + /// for (long j = 0; j < m; j++) + /// A[i][j] = ... + /// + /// This loop yields out-of-bound accesses if m is at least 20 and at the same + /// time at least one iteration of the outer loop is executed. Hence, we + /// assume: + /// + /// n <= 0 or m <= 20. + /// + /// TODO: The location where the GEP instruction is executed is not + /// necessarily the location where the memory is actually accessed. As a + /// result scanning for GEP[s] is imprecise. Even though this is not a + /// correctness problem, this imprecision may result in missed optimizations + /// or non-optimal run-time checks. + void deriveAssumptionsFromGEP(GetElementPtrInst *Inst); + + /// @brief Scan the scop and derive assumptions about parameter values. + void deriveAssumptions(); + /// Create the ScopStmt from a BasicBlock. ScopStmt(Scop &parent, TempScop &tempScop, const Region &CurRegion, BasicBlock &bb, SmallVectorImpl &NestLoops, Index: lib/Analysis/Dependences.cpp =================================================================== --- lib/Analysis/Dependences.cpp +++ lib/Analysis/Dependences.cpp @@ -122,6 +122,9 @@ } *StmtSchedule = isl_union_map_add_map(*StmtSchedule, Stmt->getScattering()); } + + *StmtSchedule = + isl_union_map_intersect_params(*StmtSchedule, S.getAssumedContext()); } /// @brief Fix all dimension of @p Zero to 0 and add it to @p user Index: lib/Analysis/ScopInfo.cpp =================================================================== --- lib/Analysis/ScopInfo.cpp +++ lib/Analysis/ScopInfo.cpp @@ -847,6 +847,60 @@ return Domain; } +void ScopStmt::deriveAssumptionsFromGEP(GetElementPtrInst *Inst) { + int Dimension = 0; + isl_ctx *Ctx = Parent.getIslCtx(); + isl_local_space *LSpace = isl_local_space_from_space(getDomainSpace()); + Type *Ty = Inst->getPointerOperandType(); + + if (auto *PtrTy = dyn_cast(Ty)) { + Dimension = 1; + Ty = PtrTy->getElementType(); + } + + while (auto ArrayTy = dyn_cast(Ty)) { + unsigned int Operand = 1 + Dimension; + + if (Inst->getNumOperands() <= Operand) + break; + + const SCEV *Expr = Parent.getSE()->getSCEV(Inst->getOperand(1 + Dimension)); + + if (isAffineExpr(&Parent.getRegion(), Expr, *Parent.getSE())) { + isl_pw_aff *AccessOffset = SCEVAffinator::getPwAff(this, Expr); + AccessOffset = + isl_pw_aff_set_tuple_id(AccessOffset, isl_dim_in, getDomainId()); + + isl_pw_aff *DimSize = isl_pw_aff_from_aff(isl_aff_val_on_domain( + isl_local_space_copy(LSpace), + isl_val_int_from_si(Ctx, ArrayTy->getNumElements()))); + + isl_set *OutOfBound = isl_pw_aff_ge_set(AccessOffset, DimSize); + OutOfBound = isl_set_intersect(getDomain(), OutOfBound); + OutOfBound = isl_set_params(OutOfBound); + isl_set *InBound = isl_set_complement(OutOfBound); + isl_set *Executed = isl_set_params(getDomain()); + + // A => B == !A or B + isl_set *InBoundIfExecuted = + isl_set_union(isl_set_complement(Executed), InBound); + + Parent.addAssumption(InBoundIfExecuted); + } + + Dimension += 1; + Ty = ArrayTy->getElementType(); + } + + isl_local_space_free(LSpace); +} + +void ScopStmt::deriveAssumptions() { + for (Instruction &Inst : *BB) + if (auto *GEP = dyn_cast(&Inst)) + deriveAssumptionsFromGEP(GEP); +} + ScopStmt::ScopStmt(Scop &parent, TempScop &tempScop, const Region &CurRegion, BasicBlock &bb, SmallVectorImpl &Nest, SmallVectorImpl &Scatter) @@ -867,6 +921,7 @@ buildScattering(Scatter); buildAccesses(tempScop); checkForReductions(); + deriveAssumptions(); } /// @brief Collect loads which might form a reduction chain with @p StoreMA @@ -1530,6 +1585,7 @@ void Scop::addAssumption(__isl_take isl_set *Set) { AssumedContext = isl_set_intersect(AssumedContext, Set); + AssumedContext = isl_set_coalesce(AssumedContext); } void Scop::printContext(raw_ostream &OS) const { Index: test/Dependences/sequential_loops.ll =================================================================== --- test/Dependences/sequential_loops.ll +++ test/Dependences/sequential_loops.ll @@ -273,7 +273,7 @@ ; VALUE: RAW dependences: ; VALUE: [p] -> { ; VALUE: Stmt_S1[i0] -> Stmt_S2[-p + i0] : -; VALUE: i0 >= p and i0 <= 9 + p and i0 >= 0 and i0 <= 99 +; VALUE: i0 >= p and i0 <= 9 + p and p <= 190 and i0 <= 99 and i0 >= 0 ; VALUE: } ; VALUE: WAR dependences: ; VALUE: [p] -> { Index: test/Isl/Ast/OpenMP/nested_loop_both_parallel_parametric.ll =================================================================== --- test/Isl/Ast/OpenMP/nested_loop_both_parallel_parametric.ll +++ test/Isl/Ast/OpenMP/nested_loop_both_parallel_parametric.ll @@ -41,20 +41,9 @@ ret void } -; At the first look both loops seem parallel, however due to the linearization -; of memory access functions, we get the following dependences: -; [n] -> { loop_body[i0, i1] -> loop_body[1024 + i0, -1 + i1]: -; 0 <= i0 < n - 1024 and 1 <= i1 < n} -; They cause the outer loop to be non-parallel. We can only prove their -; absence, if we know that n < 1024. This information is currently not available -; to polly. However, we should be able to obtain it due to the out of bounds -; memory accesses, that would happen if n >= 1024. - -; Note that we do not delinearize this access function because it is considered -; to already be affine: {{0,+,4}<%loop.i>,+,4096}<%loop.j>. - -; CHECK: for (int c1 = 0; c1 < n; c1 += 1) -; CHECK: #pragma simd +; CHECK: if (n <= 1024 ? 1 : 0) ; CHECK: #pragma omp parallel for -; CHECK: for (int c3 = 0; c3 < n; c3 += 1) -; CHECK: Stmt_loop_body(c1, c3); +; CHECK: for (int c1 = 0; c1 < n; c1 += 1) +; CHECK: #pragma simd +; CHECK: for (int c3 = 0; c3 < n; c3 += 1) +; CHECK: Stmt_loop_body(c1, c3); Index: test/Isl/Ast/alias_simple_1.ll =================================================================== --- test/Isl/Ast/alias_simple_1.ll +++ test/Isl/Ast/alias_simple_1.ll @@ -12,11 +12,11 @@ ; A[i] = B[i]; ; } ; -; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; BASI: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; TBAA: if (1) -; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; BASI: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; TBAA: if (N <= 1024 ? 1 : 0) +; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: test/Isl/Ast/alias_simple_2.ll =================================================================== --- test/Isl/Ast/alias_simple_2.ll +++ test/Isl/Ast/alias_simple_2.ll @@ -12,11 +12,11 @@ ; A[i] = B[i]; ; } ; -; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; BASI: if (1) -; TBAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; BASI: if (N <= 1024 ? 1 : 0) +; TBAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: test/Isl/Ast/alias_simple_3.ll =================================================================== --- test/Isl/Ast/alias_simple_3.ll +++ test/Isl/Ast/alias_simple_3.ll @@ -12,11 +12,11 @@ ; A[i] = B[i]; ; } ; -; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; BASI: if (1) -; TBAA: if (1) -; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) -; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; BASI: if (N <= 1024 ? 1 : 0) +; TBAA: if (N <= 1024 ? 1 : 0) +; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) +; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0])) ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: test/ScopInfo/assume_gep_bounds.ll =================================================================== --- /dev/null +++ test/ScopInfo/assume_gep_bounds.ll @@ -0,0 +1,76 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s + +; void foo(float A[10][20][30], long n, long m, long p) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; for (long k = 0; k < p; k++) +; A[i][j][k] = i + j + k; +; } + +; For the above code we want to assume that all memory accesses are within the +; bounds of the array A. In C (and LLVM-IR) this is not required, such that out +; of bounds accesses are valid. However, as such accesses are uncommon, cause +; complicated dependence pattern and as a result make dependence analysis more +; costly and may prevent or hinder useful program transformations, we assume +; absence of out-of-bound accesses. To do so we derive the set of parameter +; values for which our assumption holds. + +; CHECK: Assumed Context +; CHECK-NEXT: [n, m, p] -> { : p <= 30 and m <= 20 } + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo([20 x [30 x float]]* %A, i64 %n, i64 %m, i64 %p) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc13, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ] + %cmp = icmp slt i64 %i.0, %n + br i1 %cmp, label %for.body, label %for.end15 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc10, %for.body + %j.0 = phi i64 [ 0, %for.body ], [ %inc11, %for.inc10 ] + %cmp2 = icmp slt i64 %j.0, %m + br i1 %cmp2, label %for.body3, label %for.end12 + +for.body3: ; preds = %for.cond1 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %k.0 = phi i64 [ 0, %for.body3 ], [ %inc, %for.inc ] + %cmp5 = icmp slt i64 %k.0, %p + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %add = add nsw i64 %i.0, %j.0 + %add7 = add nsw i64 %add, %k.0 + %conv = sitofp i64 %add7 to float + %arrayidx9 = getelementptr inbounds [20 x [30 x float]]* %A, i64 %i.0, i64 %j.0, i64 %k.0 + store float %conv, float* %arrayidx9, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %inc = add nsw i64 %k.0, 1 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc10 + +for.inc10: ; preds = %for.end + %inc11 = add nsw i64 %j.0, 1 + br label %for.cond1 + +for.end12: ; preds = %for.cond1 + br label %for.inc13 + +for.inc13: ; preds = %for.end12 + %inc14 = add nsw i64 %i.0, 1 + br label %for.cond + +for.end15: ; preds = %for.cond + ret void +} Index: test/ScopInfo/assume_gep_bounds_2.ll =================================================================== --- /dev/null +++ test/ScopInfo/assume_gep_bounds_2.ll @@ -0,0 +1,94 @@ +; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s +; +; void foo(float A[restrict 10][20], float B[restrict 10][20], long n, long m, +; long p) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; A[i][j] = i + j; +; for (long i = 0; i < m; i++) +; for (long j = 0; j < p; j++) +; B[i][j] = i + j; +; } + +; This code is within bounds either if m and p are smaller than the array sizes, +; but also if only p is smaller than the size of the second B dimension and n +; is such that the first loop is never executed and consequently A is never +; accessed. In this case the value of m does not matter. + +; CHECK: Assumed Context: +; CHECK-NEXT: [n, m, p] -> { : (n <= 0 and p <= 20) or (m <= 20 and p <= 20) } + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo([20 x float]* noalias %A, [20 x float]* noalias %B, i64 %n, i64 %m, i64 %p) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc5, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ] + %cmp = icmp slt i64 %i.0, %n + br i1 %cmp, label %for.body, label %for.end7 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %j.0 = phi i64 [ 0, %for.body ], [ %inc, %for.inc ] + %cmp2 = icmp slt i64 %j.0, %m + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %add = add nsw i64 %i.0, %j.0 + %conv = sitofp i64 %add to float + %arrayidx4 = getelementptr inbounds [20 x float]* %A, i64 %i.0, i64 %j.0 + store float %conv, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nsw i64 %j.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc5 + +for.inc5: ; preds = %for.end + %inc6 = add nsw i64 %i.0, 1 + br label %for.cond + +for.end7: ; preds = %for.cond + br label %for.cond9 + +for.cond9: ; preds = %for.inc25, %for.end7 + %i8.0 = phi i64 [ 0, %for.end7 ], [ %inc26, %for.inc25 ] + %cmp10 = icmp slt i64 %i8.0, %m + br i1 %cmp10, label %for.body12, label %for.end27 + +for.body12: ; preds = %for.cond9 + br label %for.cond14 + +for.cond14: ; preds = %for.inc22, %for.body12 + %j13.0 = phi i64 [ 0, %for.body12 ], [ %inc23, %for.inc22 ] + %cmp15 = icmp slt i64 %j13.0, %p + br i1 %cmp15, label %for.body17, label %for.end24 + +for.body17: ; preds = %for.cond14 + %add18 = add nsw i64 %i8.0, %j13.0 + %conv19 = sitofp i64 %add18 to float + %arrayidx21 = getelementptr inbounds [20 x float]* %B, i64 %i8.0, i64 %j13.0 + store float %conv19, float* %arrayidx21, align 4 + br label %for.inc22 + +for.inc22: ; preds = %for.body17 + %inc23 = add nsw i64 %j13.0, 1 + br label %for.cond14 + +for.end24: ; preds = %for.cond14 + br label %for.inc25 + +for.inc25: ; preds = %for.end24 + %inc26 = add nsw i64 %i8.0, 1 + br label %for.cond9 + +for.end27: ; preds = %for.cond9 + ret void +}