Index: polly/trunk/lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- polly/trunk/lib/CodeGen/IslNodeBuilder.cpp +++ polly/trunk/lib/CodeGen/IslNodeBuilder.cpp @@ -296,6 +296,7 @@ for (const auto &I : IDToValue) Values.insert(I.second); + // NOTE: this is populated in IslNodeBuilder::addParameters for (const auto &I : OutsideLoopIterations) Values.insert(cast(I.second)->getValue()); Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp +++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp @@ -420,11 +420,18 @@ /// /// @param Kernel The kernel to scan for llvm::Values /// - /// @returns A pair, whose first element contains the set of values - /// referenced by the kernel, and whose second element contains the - /// set of functions referenced by the kernel. All functions in the - /// second set satisfy isValidFunctionInKernel. - std::pair, SetVector> + /// @returns A tuple, whose: + /// - First element contains the set of values referenced by the + /// kernel + /// - Second element contains the set of functions referenced by the + /// kernel. All functions in the set satisfy + /// `isValidFunctionInKernel`. + /// - Third element contains loops that have induction variables + /// which are used in the kernel, *and* these loops are *neither* + /// in the scop, nor do they immediately surroung the Scop. + /// See [Code generation of induction variables of loops outside + /// Scops] + std::tuple, SetVector, SetVector> getReferencesInKernel(ppcg_kernel *Kernel); /// Compute the sizes of the execution grid for a given kernel. @@ -1396,7 +1403,7 @@ return SubtreeFunctions; } -std::pair, SetVector> +std::tuple, SetVector, SetVector> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { SetVector SubtreeValues; SetVector SCEVs; @@ -1407,11 +1414,22 @@ for (const auto &I : IDToValue) SubtreeValues.insert(I.second); + // NOTE: this is populated in IslNodeBuilder::addParameters + // See [Code generation of induction variables of loops outside Scops]. + for (const auto &I : OutsideLoopIterations) + SubtreeValues.insert(cast(I.second)->getValue()); + isl_ast_node_foreach_descendant_top_down( Kernel->tree, collectReferencesInGPUStmt, &References); - for (const SCEV *Expr : SCEVs) + for (const SCEV *Expr : SCEVs) { findValues(Expr, SE, SubtreeValues); + findLoops(Expr, Loops); + } + + Loops.remove_if([this](const Loop *L) { + return S.contains(L) || L->contains(S.getEntry()); + }); for (auto &SAI : S.arrays()) SubtreeValues.remove(SAI->getBasePtr()); @@ -1458,7 +1476,7 @@ else ReplacedValues.insert(It->second); } - return std::make_pair(ReplacedValues, ValidSubtreeFunctions); + return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops); } void GPUNodeBuilder::clearDominators(Function *F) { @@ -1702,7 +1720,9 @@ SetVector SubtreeValues; SetVector SubtreeFunctions; - std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel); + SetVector Loops; + std::tie(SubtreeValues, SubtreeFunctions, Loops) = + getReferencesInKernel(Kernel); assert(Kernel->tree && "Device AST of kernel node is empty"); @@ -1712,8 +1732,6 @@ BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; ScalarMap.clear(); - SetVector Loops; - // Create for all loops we depend on values that contain the current loop // iteration. These values are necessary to generate code for SCEVs that // depend on such loops. As a result we need to pass them to the subfunction. Index: polly/trunk/test/GPGPU/loops-outside-scop.ll =================================================================== --- polly/trunk/test/GPGPU/loops-outside-scop.ll +++ polly/trunk/test/GPGPU/loops-outside-scop.ll @@ -0,0 +1,67 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP + +; There is no FileCheck because we want to make sure that this doesn't crash. +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ +; RUN: -disable-output < %s + +; REQUIRES: pollyacc + +; Due to the existence of the `fence` call, We can only detect the inner loop +; and not the outer loop. PPCGCodeGeneration had not implemented this case. +; The fix was to pull the implementation from `IslNodeBuilder. + +; Make sure that we only capture the inner loop +; SCOP: Function: f +; SCOP-NEXT: Region: %for2.body---%for2.body.fence +; SCOP-NEXT: Max Loop Depth: 1 + +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +declare void @fn_to_fence(i32 *%val) + +; void f(int *arr, bool shouldcont) { +; for(int i = 0; ; i++) { +; for(int j = 0; j < 10; j++) { +; arr[j] = i; +; } +; fence(arr); +; if (!shouldcont) break; +; } +; } + + +; Function Attrs: nounwind uwtable +define void @f(i32 *%arr, i1 %shouldcont) #1 { +entry: + br label %for.init + +for.init: ; preds = %for.end, %entry.split + %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] + br label %for2.body + +for2.body: ; preds = %"65", %"64" + %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] + %j.sext = sext i32 %j to i64 + %arr.slot = getelementptr i32, i32* %arr, i64 %j.sext + store i32 %i, i32* %arr.slot, align 4 + %exitcond = icmp eq i32 %j, 10 + %j.next = add i32 %j, 1 + br i1 %exitcond, label %for2.body.fence, label %for2.body + +for2.body.fence: ; preds = %"65" + call void @fn_to_fence(i32* %arr) #2 + br i1 %shouldcont, label %for.end, label %exit +for.end: ; preds = %"69" + %i.next = add i32 %i, 1 + br label %for.init + +exit: ; preds = %"69" + ret void + +} + + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind uwtable } +attributes #2 = { nounwind }