Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -138,6 +138,25 @@
   MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
 };
 
+/// Check if SAI's uses are entirely contained within Scop S.
+/// If a scalar is used only within a Scop, we are free to kill it, as no data
+/// can flow in/out of the value any more.
+/// @see computeMustKillsInfo
+static bool isScalarUsesContainedInScop(const Scop &S,
+                                        const ScopArrayInfo *SAI) {
+  assert(SAI->isValueKind() && "this function only deals with scalars."
+                               " Dealing with arrays requires alias analysis");
+
+  const Region &R = S.getRegion();
+  for (User *U : SAI->getBasePtr()->users()) {
+    Instruction *I = dyn_cast<Instruction>(U);
+    assert(I && "invalid user of scop array info");
+    if (!R.contains(I))
+      return false;
+  }
+  return true;
+}
+
 /// Compute must-kills needed to enable live range reordering with PPCG.
 ///
 /// @param S The Scop to compute live range reordering information
@@ -147,13 +166,14 @@
   const isl::space ParamSpace(isl::manage(S.getParamSpace()));
   MustKillsInfo Info;
 
-  // 1. Collect phi nodes in scop.
+  // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria:
+  //      1.1 phi nodes in scop.
+  //      1.2 scalars that are only used within the scop
   SmallVector<isl::id, 4> KillMemIds;
   for (ScopArrayInfo *SAI : S.arrays()) {
-    if (!SAI->isPHIKind())
-      continue;
-
-    KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
+    if (SAI->isPHIKind() ||
+        (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
+      KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
   }
 
   Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
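For intuition, the new `isScalarUsesContainedInScop` is a pure escape check: a value-kind ScopArrayInfo may be added to the kill set only if every user of its base value is an instruction inside the SCoP's region, since any outside user could still read the value after the kernel finishes. Below is a minimal standalone sketch of the same criterion phrased against generic LLVM APIs; the helper name `allUsersInRegion` is illustrative and not part of the patch, and it conservatively returns false for non-instruction users where the committed code asserts instead:

```cpp
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

// Sketch: true iff every user of V is an instruction contained in region R.
// A single out-of-region (or non-instruction) user means data can still
// flow out of V after R executes, so killing V's storage would be unsound.
static bool allUsersInRegion(const llvm::Region &R, const llvm::Value *V) {
  for (const llvm::User *U : V->users()) {
    const auto *I = llvm::dyn_cast<llvm::Instruction>(U);
    if (!I || !R.contains(I))
      return false; // conservative: unknown or escaping user
  }
  return true; // the value's live range is local to R
}
```

The patch can afford the stricter assert because a value-kind ScopArrayInfo's base pointer is only ever used by instructions; arrays are excluded outright, since proving containment for them would additionally require alias analysis, as the assert message notes.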
Index: polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
===================================================================
--- polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
+++ polly/trunk/test/GPGPU/add-scalars-in-scop-to-kills.ll
@@ -0,0 +1,71 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; Check that we detect a scop.
+; SCOP: Function: checkScalarKill
+; SCOP-NEXT: Region: %XLoopInit---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; Check that we have a scalar that is not a phi node in the scop.
+; SCOP: i32 MemRef_x_0; // Element size 4
+
+; Check that a kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; Check that we add variables that are local to a scop into the kills that we
+; pass to PPCG. This should enable PPCG to codegen this example.
+; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
+;   int x;
+; #pragma scop
+;   for(int i = 0; i < 1000; i++) {
+; XLoopInit:     x = 0;
+;
+;     if (control1 > 2)
+; C1Add:        x += 10;
+;     if (control2 > 3)
+; C2Add:        x += A[i];
+;
+; BLoopAccumX:  B[i] += x;
+;   }
+;
+; #pragma endscop
+; }
+; ModuleID = 'test.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @checkScalarKill(i32* %A, i32* %B, i32* %C, i32 %control1, i32 %control2) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %XLoopInit
+
+XLoopInit:                                        ; preds = %entry.split, %BLoopAccumX
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
+  %cmp1 = icmp sgt i32 %control1, 2
+  %x.0 = select i1 %cmp1, i32 10, i32 0
+  %cmp2 = icmp sgt i32 %control2, 3
+  br i1 %cmp2, label %C2Add, label %BLoopAccumX
+
+C2Add:                                            ; preds = %XLoopInit
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp6 = load i32, i32* %arrayidx, align 4
+  %add4 = add nsw i32 %tmp6, %x.0
+  br label %BLoopAccumX
+
+BLoopAccumX:                                      ; preds = %XLoopInit, %C2Add
+  %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %tmp11 = load i32, i32* %arrayidx7, align 4
+  %add8 = add nsw i32 %tmp11, %x.1
+  store i32 %add8, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %XLoopInit, label %for.end
+
+for.end:                                          ; preds = %BLoopAccumX
+  ret void
+}
Index: polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
===================================================================
--- polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
+++ polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll
@@ -11,6 +11,24 @@
 
 ; REQUIRES: pollyacc
 
+; Approximate C source:
+; void kernel_dynprog(int c[50]) {
+;   int iter = 0;
+;   int outl = 0;
+;
+;   while(1) {
+;     for(int indvar = 1; indvar <= 49; indvar++) {
+;       c[indvar] = undef;
+;     }
+;     add78 = c[49] + outl;
+;     inc80 = iter + 1;
+;
+;     if (true) break;
+;
+;     outl = add78;
+;     iter = inc80;
+;   }
+; }
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -24,8 +42,7 @@
 ; CODE-NEXT:   cudaCheckKernel();
 ; CODE-NEXT: }
 
-; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
 ; CODE: # kernel0
@@ -40,8 +57,8 @@
 ; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
 ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
 
-; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
+; IR: [[REGC:%.+]] = bitcast i32* %38 to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196)
 
 ; KERNEL-IR: entry:
 ; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32
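The net effect of the larger kill set is visible in the phi-nodes-in-kernel.ll checks above: `out_l.055` is now known to be dead once the SCoP finishes, so PPCG stops copying it back from the device, and only the genuinely live-out array `c` is transferred. Placing the removed and added CODE lines side by side makes this concrete (this is PPCG's generated pseudo host code as quoted by the test; `i32` is the LLVM type name, not valid C):

```cpp
// Before the patch: the scalar looked live-out and was copied back.
cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055,
                           sizeof(i32), cudaMemcpyDeviceToHost));
cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32),
                           cudaMemcpyDeviceToHost));

// After the patch: out_l.055 is in the must-kill set, so only the array
// that is live-out of the scop is transferred back to the host.
cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32),
                           cudaMemcpyDeviceToHost));
```

Note that the host-to-device copy of %out_l.055.phiops remains in the IR checks: the phi's incoming value is still needed on the device. Only the copy back to the host disappears.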