Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -159,6 +159,19 @@
   return true;
 }
 
+/// Check whether the memory of a scalar can be killed.
+///
+/// This holds for PHI-kind arrays, and for value-kind arrays accepted by
+/// isScalarUsesContainedInScop(), i.e. scalars only used within the scop.
+///
+/// @param S   The Scop handed to isScalarUsesContainedInScop().
+/// @param SAI The ScopArrayInfo to classify.
+/// @returns True if @p SAI meets either criterion, false otherwise.
+static bool canKillSAI(const Scop &S, const ScopArrayInfo *SAI) {
+  return (SAI->isPHIKind() ||
+          (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)));
+}
+
 /// Compute must-kills needed to enable live range reordering with PPCG.
 ///
 /// @params S The Scop to compute live range reordering information
@@ -173,8 +186,7 @@
   // 1.2 scalars that are only used within the scop
   SmallVector<isl::id, 4> KillMemIds;
   for (ScopArrayInfo *SAI : S.arrays()) {
-    if (SAI->isPHIKind() ||
-        (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
+    if (canKillSAI(S, SAI))
       KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
   }
 
@@ -1902,7 +1914,11 @@
     Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
     Value *Val = Builder.CreateLoad(Alloca);
     Builder.CreateStore(Val, TypedArgPtr);
-    StoredScalar = true;
+    // We only care about a scalar if there are reads / writes to it
+    // after the scop: this condition is captured by checking if it cannot
+    // be killed.
+    if (!canKillSAI(S, SAI))
+      StoredScalar = true;
 
     Arg++;
   }
Index: test/GPGPU/allow-scalar-stores.ll
===================================================================
--- /dev/null
+++ test/GPGPU/allow-scalar-stores.ll
@@ -0,0 +1,58 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure < %s | FileCheck %s -check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; SCOP: Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; Check that kernel launch is generated in host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; void f(int A[], int B[], int control, int C[]) {
+;     int x;
+; #pragma scop
+;     for(int i = 0; i < 1000; i ++) {
+;         x = 0;
+;         if(control) x = C[i];
+;         B[i] = x * A[i];
+;
+;     }
+; #pragma endscop
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32 %control, i32* %C) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %if.end
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+  %tobool = icmp eq i32 %control, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp8 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp8, %x.0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %if.end
+  ret void
+}