Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -2815,6 +2815,9 @@
       Access->ref_id = Acc->getId().release();
       Access->next = Accesses;
       Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
+      // TODO: Also mark one-element accesses to arrays as fixed-element.
+      Access->fixed_element =
+          Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false;
       Accesses = Access;
     }
 
@@ -3029,6 +3032,7 @@
       i++;
 
       collect_references(PPCGProg, &PPCGArray);
+      PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray);
     }
   }
 
@@ -3070,13 +3074,6 @@
     PPCGProg->to_outer = getArrayIdentity();
     // TODO: verify that this assignment is correct.
     PPCGProg->any_to_outer = nullptr;
-
-    // this needs to be set when live range reordering is enabled.
-    // NOTE: I believe that is conservatively correct. I'm not sure
-    //       what the semantics of this is.
-    // Quoting PPCG/gpu.h: "Order dependences on non-scalars."
-    PPCGProg->array_order =
-        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
     PPCGProg->n_stmts = std::distance(S->begin(), S->end());
     PPCGProg->stmts = getStatements();
 
@@ -3099,6 +3096,9 @@
 
     createArrays(PPCGProg, ValidSAIs);
 
+    PPCGProg->array_order = nullptr;
+    collect_order_dependences(PPCGProg);
+
     PPCGProg->may_persist = compute_may_persist(PPCGProg);
     return PPCGProg;
   }
Index: polly/trunk/lib/External/ppcg/gpu.h
===================================================================
--- polly/trunk/lib/External/ppcg/gpu.h
+++ polly/trunk/lib/External/ppcg/gpu.h
@@ -454,4 +454,6 @@
 
 __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
 void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
+void collect_order_dependences(struct gpu_prog *prog);
+isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
 #endif
Index: polly/trunk/lib/External/ppcg/gpu.c
===================================================================
--- polly/trunk/lib/External/ppcg/gpu.c
+++ polly/trunk/lib/External/ppcg/gpu.c
@@ -162,7 +162,7 @@
 /* Is "array" only accessed as individual, fixed elements?
  * That is, does each access to "array" access a single, fixed element?
  */
-static isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
+isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
 {
 	int i;
 
@@ -250,6 +250,9 @@
 static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
 	struct gpu_array_info *array, __isl_take isl_union_map *order)
 {
+	// Polly provides no independence information, so return "order"
+	// unchanged; the original body below is intentionally unreachable.
+	return order;
 	int i;
 
 	for (i = 0; i < prog->scop->pet->n_independence; ++i) {
Index: polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll
===================================================================
--- polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll
+++ polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll
@@ -1,84 +0,0 @@
-; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %entry.split---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp, tmp1] -> { Stmt_if_end[i0] -> MemRef_end[0] };
-; SCOP-NEXT:         Execution Context: [tmp, tmp1] -> {  :  }
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp, tmp1] -> { Stmt_for_body[i0] -> MemRef_control[0] };
-; SCOP-NEXT:         Execution Context: [tmp, tmp1] -> {  : tmp > 0 }
-; SCOP-NEXT: }
-
-; Check that we generate a correct "always false" branch.
-; HOST-IR:  br i1 false, label %polly.start, label %entry.split.pre_entry_bb
-
-; This test case checks that we generate correct code if PPCGCodeGeneration
-; decides a build is unsuccessful with invariant load hoisting enabled.
-;
-; There is a conditional branch which switches between the original code and
-; the new code. We try to set this conditional branch to branch on false.
-; However, invariant load hoisting changes the structure of the scop, so we
-; need to change the way we *locate* this instruction.
-;
-;    void f(const int *end, int *arr, const int *control, const int *readarr) {
-;      for (int i = 0; i < *end; i++) {
-;        int t = 0;
-;        if (*control > 3) {
-;          t += readarr[i];
-;        }
-;        arr[i] = t;
-;      }
-;    }
-;
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-
-define void @f(i32* %end, i32* %arr, i32* %control, i32* %readarr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %tmp3 = load i32, i32* %end, align 4
-  %cmp4 = icmp sgt i32 %tmp3, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %if.end
-  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
-  %tmp1 = load i32, i32* %control, align 4
-  %cmp1 = icmp sgt i32 %tmp1, 3
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, i32* %readarr, i32 %i.05
-  %tmp2 = load i32, i32* %arrayidx, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ]
-  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i32 %i.05
-  store i32 %t.0, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.05, 1
-  %tmp = load i32, i32* %end, align 4
-  %cmp = icmp slt i32 %inc, %tmp
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %if.end
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
Index: polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll
===================================================================
--- polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll
+++ polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll
@@ -0,0 +1,78 @@
+; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
+; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
+; RUN: -polly-acc-dump-code -disable-output \
+; RUN:   < %s | FileCheck %s -check-prefix=CODE
+
+; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
+; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
+; RUN: -polly-acc-dump-kernel-ir -disable-output \
+; RUN:   < %s | FileCheck %s -check-prefix=KERNELIR
+
+; REQUIRES: pollyacc
+
+;    void f(const int *end, int *arr, const int *control, const int *readarr) {
+;      for (int i = 0; i < *end; i++) {
+;        int t = 0;
+;        if (*control > 3) {
+;          t += readarr[i];
+;        }
+;        arr[i] = t;
+;      }
+;    }
+
+; This test case tests the ability to infer that `t` is local to each loop
+; iteration, and can therefore be privatized.
+
+; CODE: # kernel0
+; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1)
+; CODE-NEXT:   if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) {
+; CODE-NEXT:     Stmt_for_body(32 * b0 + t0 + 1048576 * c0);
+; CODE-NEXT:     if (tmp1 >= 4)
+; CODE-NEXT:       Stmt_if_then(32 * b0 + t0 + 1048576 * c0);
+; CODE-NEXT:     Stmt_if_end(32 * b0 + t0 + 1048576 * c0);
+; CODE-NEXT:   }
+
+; KERNELIR: %private_array = alloca i32
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.12.0"
+
+define void @f(i32* %end, i32* %arr, i32* %control, i32* %readarr) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %tmp3 = load i32, i32* %end, align 4
+  %cmp4 = icmp sgt i32 %tmp3, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %if.end
+  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
+  %tmp1 = load i32, i32* %control, align 4
+  %cmp1 = icmp sgt i32 %tmp1, 3
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %readarr, i32 %i.05
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i32 %i.05
+  store i32 %t.0, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.05, 1
+  %tmp = load i32, i32* %end, align 4
+  %cmp = icmp slt i32 %inc, %tmp
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %if.end
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
Index: polly/trunk/test/GPGPU/non-read-only-scalars.ll
===================================================================
--- polly/trunk/test/GPGPU/non-read-only-scalars.ll
+++ polly/trunk/test/GPGPU/non-read-only-scalars.ll
@@ -68,11 +68,16 @@
 ; CODE-NEXT: Stmt_bb17();
 
 ; CODE: # kernel2
-; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) {
-; CODE-NEXT:   Stmt_bb18(c0);
-; CODE-NEXT:   if (c0 <= 31)
-; CODE-NEXT:     Stmt_bb20(c0);
-; CODE-NEXT: }
+; CODE-NEXT: {
+; CODE-NEXT:   read();
+; CODE-NEXT:   for (int c0 = 0; c0 <= 32; c0 += 1) {
+; CODE-NEXT:     Stmt_bb18(c0);
+; CODE-NEXT:     if (c0 <= 31)
+; CODE-NEXT:       Stmt_bb20(c0);
+; CODE-NEXT:   }
+; CODE-NEXT:   write();
+; CODE-NEXT: }
+
 
 ; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_sum_0__phi)
 ; KERNEL-IR:  store float 0.000000e+00, float* %sum.0.phiops
Index: polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
===================================================================
--- polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
+++ polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-scops  \
+; RUN: -polly-acc-dump-code -analyze \
+; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP
+
+; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
+; RUN: -polly-acc-dump-code \
+; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE
+
+; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
+; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; SCOP:      Invariant Accesses: {
+; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
+; SCOP-NEXT:             { Stmt_loop[i0] -> MemRef_p[0] };
+; SCOP-NEXT:         Execution Context: {  :  }
+; SCOP-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: {
+; CODE-NEXT:   if (32 * b0 + t0 <= 1025) {
+; CODE-NEXT:     Stmt_loop(32 * b0 + t0);
+; CODE-NEXT:     write(0);
+; CODE-NEXT:   }
+; CODE-NEXT:   sync0();
+; CODE-NEXT: }
+
+; Check that we generate a correct "always false" branch.
+; HOST-IR:  br i1 false, label %polly.start, label %loop.pre_entry_bb
+
+; This test case checks that we generate correct code if PPCGCodeGeneration
+; decides a build is unsuccessful with invariant load hoisting enabled.
+;
+; There is a conditional branch which switches between the original code and
+; the new code. We try to set this conditional branch to branch on false.
+; However, invariant load hoisting changes the structure of the scop, so we
+; need to change the way we *locate* this instruction.
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.12.0"
+
+define void @foo(float* %A, float* %p) {
+entry:
+  br label %loop
+
+loop:
+  %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
+  %indvar.next = add i64 %indvar, 1
+  %invariant = load float, float* %p
+  %ptr = getelementptr float, float* %A, i64 %indvar
+  store float 42.0, float* %ptr
+  %cmp = icmp sle i64 %indvar, 1024
+  br i1 %cmp, label %loop, label %loop2
+
+loop2:
+  %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2]
+  %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2]
+  %indvar2.next = add i64 %indvar2, 1
+  store float %indvar2f, float* %A
+  %cmp2 = icmp sle i64 %indvar2, 1024
+  br i1 %cmp2, label %loop2, label %end
+
+end:
+  ret void
+}