Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -296,6 +296,7 @@ for (const auto &I : IDToValue) Values.insert(I.second); + // NOTE: this is populated in IslNodeBuilder::addParameters for (const auto &I : OutsideLoopIterations) Values.insert(cast(I.second)->getValue()); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -417,7 +417,7 @@ /// referenced by the kernel, and whose second element contains the /// set of functions referenced by the kernel. All functions in the /// second set satisfy isValidFunctionInKernel. - std::pair, SetVector> + std::tuple, SetVector, SetVector> getReferencesInKernel(ppcg_kernel *Kernel); /// Compute the sizes of the execution grid for a given kernel. @@ -1389,7 +1389,7 @@ return SubtreeFunctions; } -std::pair, SetVector> +std::tuple, SetVector, SetVector> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { SetVector SubtreeValues; SetVector SCEVs; @@ -1400,11 +1400,21 @@ for (const auto &I : IDToValue) SubtreeValues.insert(I.second); + // NOTE: this is populated in IslNodeBuilder::addParameters + for (const auto &I : OutsideLoopIterations) + SubtreeValues.insert(cast(I.second)->getValue()); + isl_ast_node_foreach_descendant_top_down( Kernel->tree, collectReferencesInGPUStmt, &References); - for (const SCEV *Expr : SCEVs) + for (const SCEV *Expr : SCEVs) { findValues(Expr, SE, SubtreeValues); + findLoops(Expr, Loops); + } + + Loops.remove_if([this](const Loop *L) { + return S.contains(L) || L->contains(S.getEntry()); + }); for (auto &SAI : S.arrays()) SubtreeValues.remove(SAI->getBasePtr()); @@ -1451,7 +1461,7 @@ else ReplacedValues.insert(It->second); } - return std::make_pair(ReplacedValues, ValidSubtreeFunctions); + return std::make_tuple(ReplacedValues, 
ValidSubtreeFunctions, Loops); } void GPUNodeBuilder::clearDominators(Function *F) { @@ -1528,6 +1538,7 @@ Value * GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, SetVector SubtreeValues) { + const int NumArgs = F->arg_size(); std::vector ArgSizes(NumArgs); @@ -1695,7 +1706,9 @@ SetVector SubtreeValues; SetVector SubtreeFunctions; - std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel); + SetVector Loops; + std::tie(SubtreeValues, SubtreeFunctions, Loops) = + getReferencesInKernel(Kernel); assert(Kernel->tree && "Device AST of kernel node is empty"); @@ -1705,8 +1718,6 @@ BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; ScalarMap.clear(); - SetVector Loops; - // Create for all loops we depend on values that contain the current loop // iteration. These values are necessary to generate code for SCEVs that // depend on such loops. As a result we need to pass them to the subfunction. Index: test/GPGPU/test.ll =================================================================== --- /dev/null +++ test/GPGPU/test.ll @@ -0,0 +1,61 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ +; RUN: -disable-output < %s + +; ModuleID = 'reduced.ll' +source_filename = "bugpoint-output-cb5513b.bc" +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +declare void @fn_to_fence(i32 *%val) + + +; void f(int *arr, bool shouldcont) { +; for(int i = 0; ; i++) { +; for(int j = 0; j <= 10; j++) { +; arr[j] = i; +; } +; fn_to_fence(arr); +; if (!shouldcont) break; +; } +; } + + +; Function Attrs: nounwind uwtable +define void @f(i32 *%arr, i1 %shouldcont) #1 { +entry: + br label %for.init + +for.init: ; preds = %for.end, %entry + %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] + br label %for2.body + +for2.body: ; preds = 
%for2.body, %for.init + %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] + %j.sext = sext i32 %j to i64 + %arr.slot = getelementptr i32, i32* %arr, i64 %j.sext + store i32 %i, i32* %arr.slot, align 4 + %exitcond = icmp eq i32 %j, 10 + %j.next = add i32 %j, 1 + br i1 %exitcond, label %for2.body.fence, label %for2.body + +for2.body.fence: ; preds = %for2.body + call void @fn_to_fence(i32* %arr) #2 + br i1 %shouldcont, label %for.end, label %exit +for.end: ; preds = %for2.body.fence + %i.next = add i32 %i, 1 + br label %for.init + +exit: ; preds = %for2.body.fence + ret void + +} + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) #2 + +declare void @used_fn(...) + + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind uwtable } +attributes #2 = { nounwind }