Index: include/polly/Support/SCEVValidator.h =================================================================== --- include/polly/Support/SCEVValidator.h +++ include/polly/Support/SCEVValidator.h @@ -70,8 +70,9 @@ /// @param Scope Location where the value is needed. /// @param AllowLoops Whether loop recurrences outside the loop that are in the /// region count as dependence. -bool hasScalarDepsInsideRegion(const llvm::SCEV *S, const llvm::Region *R, - llvm::Loop *Scope, bool AllowLoops); +bool hasScalarDepsInsideRegion(const llvm::SCEV *Expr, const llvm::Region *R, + llvm::Loop *Scope, bool AllowLoops, + const InvariantLoadsSetTy *ILS); bool isAffineExpr(const llvm::Region *R, llvm::Loop *Scope, const llvm::SCEV *Expression, llvm::ScalarEvolution &SE, InvariantLoadsSetTy *ILS = nullptr); Index: include/polly/Support/ScopHelper.h =================================================================== --- include/polly/Support/ScopHelper.h +++ include/polly/Support/ScopHelper.h @@ -446,4 +446,4 @@ llvm::Loop *getFirstNonBoxedLoopFor(llvm::BasicBlock *BB, llvm::LoopInfo &LI, const BoxedLoopsSetTy &BoxedLoops); } // namespace polly -#endif \ No newline at end of file +#endif Index: lib/Analysis/ScopDetection.cpp =================================================================== --- lib/Analysis/ScopDetection.cpp +++ lib/Analysis/ScopDetection.cpp @@ -856,7 +856,9 @@ continue; } } - if (hasScalarDepsInsideRegion(DelinearizedSize, &CurRegion, Scope, false)) + if (hasScalarDepsInsideRegion(DelinearizedSize, &CurRegion, Scope, false, + nullptr)) + // &Context.RequiredILS)) return invalid( Context, /*Assert=*/true, DelinearizedSize, Context.Accesses[BasePointer].front().first, BaseValue); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1433,13 +1433,35 @@ Value * GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function 
*F, SetVector SubtreeValues) { + + BasicBlock *EntryBlock = + &Builder.GetInsertBlock()->getParent()->getEntryBlock(); + errs() << __FILE__ << ":" << __LINE__ << " subtreeValues: \n"; + for (Value *Val : SubtreeValues) { + Instruction *Inst = dyn_cast(Val); + if (!Inst) { + continue; + // assert(false && "no instruction?"); + } + + errs() << "\t**" << *Inst << "\n"; + BasicBlock *ValBB = Inst->getParent(); + // if invariant load hoisting is enabled, hoist all the parameters to the + // nearest dominator of the basic block of the value and the entry block + // of the PPCGCodeGen + if (PollyInvariantLoadHoisting) { + BasicBlock *Parent = DT.findNearestCommonDominator(ValBB, EntryBlock); + Inst->removeFromParent(); + Inst->insertBefore(Parent->getTerminator()); + } + } + errs() << "--\n"; + const int NumArgs = F->arg_size(); std::vector ArgSizes(NumArgs); Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); - BasicBlock *EntryBlock = - &Builder.GetInsertBlock()->getParent()->getEntryBlock(); auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); std::string Launch = "polly_launch_" + std::to_string(Kernel->id); Instruction *Parameters = new AllocaInst( @@ -2702,6 +2724,7 @@ int has_permutable = has_any_permutable_node(Schedule); if (!has_permutable || has_permutable < 0) { + errs() << "** Schedule does not have permutable bands\n"; Schedule = isl_schedule_free(Schedule); } else { Schedule = map_to_device(PPCGGen, Schedule); Index: lib/Support/SCEVValidator.cpp =================================================================== --- lib/Support/SCEVValidator.cpp +++ lib/Support/SCEVValidator.cpp @@ -462,12 +462,14 @@ class SCEVInRegionDependences { const Region *R; Loop *Scope; + const InvariantLoadsSetTy *ILS; bool AllowLoops; bool HasInRegionDeps = false; public: - SCEVInRegionDependences(const Region *R, Loop *Scope, bool AllowLoops) - : R(R), Scope(Scope), AllowLoops(AllowLoops) {} + SCEVInRegionDependences(const Region *R, Loop 
*Scope, bool AllowLoops, + const InvariantLoadsSetTy *ILS) + : R(R), Scope(Scope), ILS(ILS), AllowLoops(AllowLoops) {} bool follow(const SCEV *S) { if (auto Unknown = dyn_cast(S)) { @@ -478,6 +480,32 @@ if (Call && isConstCall(Call)) return false; + if (ILS && Inst) { + auto printILS = [this]() -> void { + if (!ILS) { + dbgs() << "nullptr"; + return; + } + int i = 0; + for (const AssertingVH &invariantLoad : *ILS) { + dbgs() << i << "#:"; + invariantLoad->dump(); + i++; + } + }; + // Do not consider invariant loads. + LoadInst *LI = dyn_cast(Inst); + DEBUG(dbgs() << "\n===///===\n"); + DEBUG(dbgs() << "Inst: " << *Inst + << " |isload?: " << (LI ? "true" : "false") << "\n";); + DEBUG(dbgs() << "Invariant Loads set:\n"); + DEBUG(printILS()); + if (LI && ILS->count(LI) > 0) { + DEBUG(dbgs() << "returning false for instruction: " << *LI << "\n";); + return false; + } + } + // Return true when Inst is defined inside the region R. if (!Inst || !R->contains(Inst)) return true; @@ -579,8 +607,9 @@ } bool hasScalarDepsInsideRegion(const SCEV *Expr, const Region *R, - llvm::Loop *Scope, bool AllowLoops) { - SCEVInRegionDependences InRegionDeps(R, Scope, AllowLoops); + llvm::Loop *Scope, bool AllowLoops, + const InvariantLoadsSetTy *ILS) { + SCEVInRegionDependences InRegionDeps(R, Scope, AllowLoops, ILS); SCEVTraversal ST(InRegionDeps); ST.visitAll(Expr); return InRegionDeps.hasDependences(); Index: lib/Support/ScopHelper.cpp =================================================================== --- lib/Support/ScopHelper.cpp +++ lib/Support/ScopHelper.cpp @@ -496,13 +496,18 @@ bool polly::canSynthesize(const Value *V, const Scop &S, ScalarEvolution *SE, Loop *Scope) { + const InvariantLoadsSetTy &ILS = S.getRequiredInvariantLoads(); if (!V || !SE->isSCEVable(V->getType())) return false; - if (const SCEV *Scev = SE->getSCEVAtScope(const_cast(V), Scope)) - if (!isa(Scev)) - if (!hasScalarDepsInsideRegion(Scev, &S.getRegion(), Scope, false)) + if (const SCEV *Scev = 
SE->getSCEVAtScope(const_cast(V), Scope)) { + if (!isa(Scev)) { + if (!hasScalarDepsInsideRegion(Scev, &S.getRegion(), Scope, false, + &ILS)) { return true; + } + } + } return false; } Index: test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll =================================================================== --- /dev/null +++ test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll @@ -0,0 +1,58 @@ +; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] }; +; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } +; SCOP-NEXT: } + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. 
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; void f(int *begin, int *end, int *arr) { +; for (int i = *begin; i < *end; i++) { +; arr[i] = 0; +; } +; } +; +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +define void @f(i32* %begin, i32* %end, i32* %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp1 = load i32, i32* %begin, align 4 + %tmp41 = load i32, i32* %end, align 4 + %cmp2 = icmp slt i32 %tmp1, %tmp41 + br i1 %cmp2, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %arr, i32 %i.03 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i32 %i.03, 1 + %tmp4 = load i32, i32* %end, align 4 + %cmp = icmp slt i32 %inc, %tmp4 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} Index: test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll =================================================================== --- /dev/null +++ test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll @@ -0,0 +1,54 @@ +; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; Check that we detect a scop with invariant accesses. 
+; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; +; SCOP-NEXT: Execution Context: [beginval] -> { : } +; SCOP-NEXT: } + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +; +; void f(int *begin, int *arr) { +; for (int i = *begin; i < 100; i++) { +; arr[i] = 0; +; } +; } + + +define void @f(i32* %begin, i32* %arr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %beginval = load i32, i32* %begin, align 4 + %cmp1 = icmp slt i32 %beginval, 100 + br i1 %cmp1, label %for.body, label %for.end + + + +for.body: ; preds = %entry.split, %for.body + %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %arr, i32 %ival + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i32 %ival, 1 + %cmp = icmp slt i32 %ival, 99 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +}