Index: include/polly/Support/SCEVValidator.h =================================================================== --- include/polly/Support/SCEVValidator.h +++ include/polly/Support/SCEVValidator.h @@ -70,8 +70,9 @@ /// @param Scope Location where the value is needed. /// @param AllowLoops Whether loop recurrences outside the loop that are in the /// region count as dependence. -bool hasScalarDepsInsideRegion(const llvm::SCEV *S, const llvm::Region *R, - llvm::Loop *Scope, bool AllowLoops); +bool hasScalarDepsInsideRegion(const llvm::SCEV *Expr, const llvm::Region *R, + llvm::Loop *Scope, bool AllowLoops, + const InvariantLoadsSetTy &ILS); bool isAffineExpr(const llvm::Region *R, llvm::Loop *Scope, const llvm::SCEV *Expression, llvm::ScalarEvolution &SE, InvariantLoadsSetTy *ILS = nullptr); Index: include/polly/Support/ScopHelper.h =================================================================== --- include/polly/Support/ScopHelper.h +++ include/polly/Support/ScopHelper.h @@ -446,4 +446,4 @@ llvm::Loop *getFirstNonBoxedLoopFor(llvm::BasicBlock *BB, llvm::LoopInfo &LI, const BoxedLoopsSetTy &BoxedLoops); } // namespace polly -#endif \ No newline at end of file +#endif Index: lib/Analysis/ScopDetection.cpp =================================================================== --- lib/Analysis/ScopDetection.cpp +++ lib/Analysis/ScopDetection.cpp @@ -856,7 +856,8 @@ continue; } } - if (hasScalarDepsInsideRegion(DelinearizedSize, &CurRegion, Scope, false)) + if (hasScalarDepsInsideRegion(DelinearizedSize, &CurRegion, Scope, false, + Context.RequiredILS)) return invalid( Context, /*Assert=*/true, DelinearizedSize, Context.Accesses[BasePointer].front().first, BaseValue); Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -322,6 +322,16 @@ Loops.remove_if([this](const Loop *L) { return S.contains(L) || L->contains(S.getEntry()); }); + + SetVector 
<Value *> ReplacedValues; + for (auto V : Values) { + if (ValueMap.count(V)) { + ReplacedValues.insert(ValueMap[V]); + } else { + ReplacedValues.insert(V); + } + } + Values = ReplacedValues; } void IslNodeBuilder::updateValues(ValueMapT &NewValues) { Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1281,6 +1281,7 @@ assert(F && "F is an invalid pointer"); // We string compare against the name of the function to allow // all variants of the intrinsic "llvm.sqrt.*" + return F->isIntrinsic() && F->getName().startswith("llvm.sqrt"); } @@ -1355,10 +1356,19 @@ make_filter_range(SubtreeValues, isValidSubtreeValue); SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(), ValidSubtreeValuesIt.end()); + SetVector<Function *> ValidSubtreeFunctions( getFunctionsFromRawSubtreeValues(SubtreeValues)); - return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions); + SetVector<Value *> MappedSubtreeValues; + for (auto Val : ValidSubtreeValues) { + if (ValueMap.count(Val)) { + MappedSubtreeValues.insert(ValueMap[Val]); + } else { + MappedSubtreeValues.insert(Val); + } + } + return std::make_pair(MappedSubtreeValues, ValidSubtreeFunctions); } void GPUNodeBuilder::clearDominators(Function *F) { @@ -1501,8 +1511,7 @@ Index++; } - int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - + const int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); for (long i = 0; i < NumHostIters; i++) { isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); Value *Val = IDToValue[Id]; @@ -1519,11 +1528,12 @@ Index++; } - int NumVars = isl_space_dim(Kernel->space, isl_dim_param); - + const int NumVars = isl_space_dim(Kernel->space, isl_dim_param); for (long i = 0; i < NumVars; i++) { isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); Value *Val = IDToValue[Id]; + if (ValueMap.count(Val)) + Val = ValueMap[Val]; isl_id_free(Id); ArgSizes[Index] 
= computeSizeInBytes(Val->getType()); @@ -2968,7 +2978,6 @@ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); - // We currently do not support functions other than intrinsics inside // kernels, as code generation will need to offload function calls to the // kernel. This may lead to a kernel trying to call a function on the host. Index: lib/Support/SCEVValidator.cpp =================================================================== --- lib/Support/SCEVValidator.cpp +++ lib/Support/SCEVValidator.cpp @@ -462,12 +462,14 @@ class SCEVInRegionDependences { const Region *R; Loop *Scope; + const InvariantLoadsSetTy &ILS; bool AllowLoops; bool HasInRegionDeps = false; public: - SCEVInRegionDependences(const Region *R, Loop *Scope, bool AllowLoops) - : R(R), Scope(Scope), AllowLoops(AllowLoops) {} + SCEVInRegionDependences(const Region *R, Loop *Scope, bool AllowLoops, + const InvariantLoadsSetTy &ILS) - : R(R), Scope(Scope), AllowLoops(AllowLoops) {} + : R(R), Scope(Scope), ILS(ILS), AllowLoops(AllowLoops) {} bool follow(const SCEV *S) { if (auto Unknown = dyn_cast<SCEVUnknown>(S)) { @@ -478,6 +480,13 @@ if (Call && isConstCall(Call)) return false; + if (Inst) { + // Do not consider invariant loads. + LoadInst *LI = dyn_cast<LoadInst>(Inst); + if (LI && ILS.count(LI) > 0) + return false; + } + // Return true when Inst is defined inside the region R. 
if (!Inst || !R->contains(Inst)) return true; @@ -579,8 +588,9 @@ } bool hasScalarDepsInsideRegion(const SCEV *Expr, const Region *R, - llvm::Loop *Scope, bool AllowLoops) { - SCEVInRegionDependences InRegionDeps(R, Scope, AllowLoops); + llvm::Loop *Scope, bool AllowLoops, + const InvariantLoadsSetTy &ILS) { + SCEVInRegionDependences InRegionDeps(R, Scope, AllowLoops, ILS); SCEVTraversal<SCEVInRegionDependences> ST(InRegionDeps); ST.visitAll(Expr); return InRegionDeps.hasDependences(); Index: lib/Support/ScopHelper.cpp =================================================================== --- lib/Support/ScopHelper.cpp +++ lib/Support/ScopHelper.cpp @@ -499,9 +499,10 @@ if (!V || !SE->isSCEVable(V->getType())) return false; + const InvariantLoadsSetTy &ILS = S.getRequiredInvariantLoads(); if (const SCEV *Scev = SE->getSCEVAtScope(const_cast<Value *>(V), Scope)) if (!isa<SCEVCouldNotCompute>(Scev)) - if (!hasScalarDepsInsideRegion(Scev, &S.getRegion(), Scope, false)) + if (!hasScalarDepsInsideRegion(Scev, &S.getRegion(), Scope, false, ILS)) return true; return false; Index: test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll =================================================================== --- /dev/null +++ test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll @@ -0,0 +1,58 @@ +; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> 
MemRef_end[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: } + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; void f(int *begin, int *end, int *arr) { +; for (int i = *begin; i < *end; i++) { +; arr[i] = 0; +; } +; } +; +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +define void @f(i32* %begin, i32* %end, i32* %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp1 = load i32, i32* %begin, align 4 + %tmp41 = load i32, i32* %end, align 4 + %cmp2 = icmp slt i32 %tmp1, %tmp41 + br i1 %cmp2, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %arr, i32 %i.03 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i32 %i.03, 1 + %tmp4 = load i32, i32* %end, align 4 + %cmp = icmp slt i32 %inc, %tmp4 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} Index: test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll =================================================================== --- /dev/null +++ test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll @@ -0,0 +1,54 @@ +; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Check that we detect a scop with invariant accesses. 
+; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [beginval] -> { : } +; SCOP-NEXT: } + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +; +; void f(int *begin, int *arr) { +; for (int i = *begin; i < 100; i++) { +; arr[i] = 0; +; } +; } + + +define void @f(i32* %begin, i32* %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %beginval = load i32, i32* %begin, align 4 + %cmp1 = icmp slt i32 %beginval, 100 + br i1 %cmp1, label %for.body, label %for.end + + + +for.body: ; preds = %for.body.lr.ph, %for.body + %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %arr, i32 %ival + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i32 %ival, 1 + %cmp = icmp slt i32 %ival, 99 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} Index: test/lit.cfg =================================================================== --- test/lit.cfg +++ test/lit.cfg @@ -3,6 +3,7 @@ import os import platform import re +import subprocess import lit.formats import lit.util @@ -80,8 +81,9 @@ lit_config.fatal('No site specific configuration available!') # Get the source and object roots. 
- llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip() - llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip() + llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).decode("utf-8").strip() + llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).decode("utf-8").strip() + polly_src_root = os.path.join(llvm_src_root, "tools", "polly") polly_obj_root = os.path.join(llvm_obj_root, "tools", "polly")