Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -138,6 +138,25 @@
   MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
 };
 
+/// Check if SAI's uses are entirely contained within Scop S.
+/// If a scalar is used only within a Scop, we are free to kill it, as no data
+/// can flow in/out of the value any more.
+/// @see computeMustKillsInfo
+static bool isScalarUsesContainedInScop(const Scop &S,
+                                        const ScopArrayInfo *SAI) {
+  assert(SAI->isValueKind() && "this function only deals with scalars."
+                               " Dealing with arrays requires alias analysis");
+
+  const Region &R = S.getRegion();
+  for (User *U : SAI->getBasePtr()->users()) {
+    Instruction *I = dyn_cast<Instruction>(U);
+    assert(I && "invalid user of scop array info");
+    if (!R.contains(I))
+      return false;
+  }
+  return true;
+}
+
 /// Compute must-kills needed to enable live range reordering with PPCG.
 ///
 /// @param S The Scop to compute live range reordering information
@@ -147,13 +166,14 @@
   const isl::space ParamSpace(isl::manage(S.getParamSpace()));
   MustKillsInfo Info;
 
-  // 1. Collect phi nodes in scop.
+  // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria:
+  //      1.1 phi nodes in scop.
+  //      1.2 scalars that are only used within the scop
   SmallVector<isl::id, 4> KillMemIds;
   for (ScopArrayInfo *SAI : S.arrays()) {
-    if (!SAI->isPHIKind())
-      continue;
-
-    KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
+    if (SAI->isPHIKind() ||
+        (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
+      KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
   }
 
   Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
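For intuition, the new `isScalarUsesContainedInScop` is a pure escape check: a value-kind ScopArrayInfo may be added to the kill set only if every user of its base value is an instruction inside the SCoP's region, since any outside user could still read the value after the kernel finishes. Below is a minimal standalone sketch of the same criterion phrased against generic LLVM APIs; the helper name `allUsersInRegion` is illustrative and not part of the patch, and it conservatively returns false for non-instruction users where the committed code asserts instead:

```cpp
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

// Sketch: true iff every user of V is an instruction contained in region R.
// A single out-of-region (or non-instruction) user means data can still
// flow out of V after R executes, so killing V's storage would be unsound.
static bool allUsersInRegion(const llvm::Region &R, const llvm::Value *V) {
  for (const llvm::User *U : V->users()) {
    const auto *I = llvm::dyn_cast<llvm::Instruction>(U);
    if (!I || !R.contains(I))
      return false; // conservative: unknown or escaping user
  }
  return true; // the value's live range is local to R
}
```

The patch can afford the stricter assert because a value-kind ScopArrayInfo's base pointer is only ever used by instructions; arrays are excluded outright, since proving containment for them would additionally require alias analysis, as the assert message notes.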
Index: polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
===================================================================
--- polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
+++ polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
@@ -0,0 +1,71 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; Check that we detect a scop.
+; SCOP: Function: checkScalarKill
+; SCOP-NEXT: Region: %XLoopInit---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; Check that we have a scalar that is not a phi node in the scop.
+; SCOP: i32 MemRef_x_0; // Element size 4
+
+; Check that a kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; Check that we add variables that are local to a scop into the kills that we
+; pass to PPCG. This should enable PPCG to codegen this example.
+; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
+;   int x;
+; #pragma scop
+;   for(int i = 0; i < 1000; i++) {
+; XLoopInit:     x = 0;
+;
+;     if (control1 > 2)
+; C1Add:        x += 10;
+;     if (control2 > 3)
+; C2Add:        x += A[i];
+;
+; BLoopAccumX:  B[i] += x;
+;   }
+;
+; #pragma endscop
+; }
+; ModuleID = 'test.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @checkScalarKill(i32* %A, i32* %B, i32* %C, i32 %control1, i32 %control2) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %XLoopInit
+
+XLoopInit:                                        ; preds = %entry.split, %BLoopAccumX
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
+  %cmp1 = icmp sgt i32 %control1, 2
+  %x.0 = select i1 %cmp1, i32 10, i32 0
+  %cmp2 = icmp sgt i32 %control2, 3
+  br i1 %cmp2, label %C2Add, label %BLoopAccumX
+
+C2Add:                                            ; preds = %XLoopInit
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp6 = load i32, i32* %arrayidx, align 4
+  %add4 = add nsw i32 %tmp6, %x.0
+  br label %BLoopAccumX
+
+BLoopAccumX:                                      ; preds = %XLoopInit, %C2Add
+  %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %tmp11 = load i32, i32* %arrayidx7, align 4
+  %add8 = add nsw i32 %tmp11, %x.1
+  store i32 %add8, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %XLoopInit, label %for.end
+
+for.end:                                          ; preds = %BLoopAccumX
+  ret void
+}
Index: polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
===================================================================
--- polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
+++ polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
@@ -11,6 +11,24 @@
 
 ; REQUIRES: pollyacc
 
+; Approximate C source:
+; void kernel_dynprog(int c[50]) {
+;   int iter = 0;
+;   int outl = 0;
+;
+;   while(1) {
+;     for(int indvar = 1; indvar <= 49; indvar++) {
+;       c[indvar] = undef;
+;     }
+;     add78 = c[49] + outl;
+;     inc80 = iter + 1;
+;
+;     if (true) break;
+;
+;     outl = add78;
+;     iter = inc80;
+;   }
+; }
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -24,8 +42,7 @@
 ; CODE-NEXT:   cudaCheckKernel();
 ; CODE-NEXT: }
 
-; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
 ; CODE: # kernel0
@@ -40,8 +57,8 @@
 ; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
 ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
 
-; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
+; IR: [[REGC:%.+]] = bitcast i32* %38 to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196)
 
 ; KERNEL-IR: entry:
 ; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32
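The net effect of the larger kill set is visible in the phi-nodes-in-kernel.ll checks above: `out_l.055` is now known to be dead once the SCoP finishes, so PPCG stops copying it back from the device, and only the genuinely live-out array `c` is transferred. Placing the removed and added CODE lines side by side makes this concrete (this is PPCG's generated pseudo host code as quoted by the test; `i32` is the LLVM type name, not valid C):

```cpp
// Before the patch: the scalar looked live-out and was copied back.
cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055,
                           sizeof(i32), cudaMemcpyDeviceToHost));
cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32),
                           cudaMemcpyDeviceToHost));

// After the patch: out_l.055 is in the must-kill set, so only the array
// that is live-out of the scop is transferred back to the host.
cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32),
                           cudaMemcpyDeviceToHost));
```

Note that the host-to-device copy of %out_l.055.phiops remains in the IR checks: the phi's incoming value is still needed on the device. Only the copy back to the host disappears.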