Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp +++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp @@ -2815,6 +2815,9 @@ Access->ref_id = Acc->getId().release(); Access->next = Accesses; Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); + // TODO: Also mark one-element accesses to arrays as fixed-element. + Access->fixed_element = + Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false; Accesses = Access; } @@ -3029,6 +3032,7 @@ i++; collect_references(PPCGProg, &PPCGArray); + PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray); } } @@ -3070,13 +3074,6 @@ PPCGProg->to_outer = getArrayIdentity(); // TODO: verify that this assignment is correct. PPCGProg->any_to_outer = nullptr; - - // this needs to be set when live range reordering is enabled. - // NOTE: I believe that is conservatively correct. I'm not sure - // what the semantics of this is. - // Quoting PPCG/gpu.h: "Order dependences on non-scalars." 
- PPCGProg->array_order = - isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGProg->n_stmts = std::distance(S->begin(), S->end()); PPCGProg->stmts = getStatements(); @@ -3099,6 +3096,9 @@ createArrays(PPCGProg, ValidSAIs); + PPCGProg->array_order = nullptr; + collect_order_dependences(PPCGProg); + PPCGProg->may_persist = compute_may_persist(PPCGProg); return PPCGProg; } Index: polly/trunk/lib/External/ppcg/gpu.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu.h +++ polly/trunk/lib/External/ppcg/gpu.h @@ -454,4 +454,6 @@ __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); +void collect_order_dependences(struct gpu_prog *prog); +isl_bool only_fixed_element_accessed(struct gpu_array_info *array); #endif Index: polly/trunk/lib/External/ppcg/gpu.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu.c +++ polly/trunk/lib/External/ppcg/gpu.c @@ -162,7 +162,7 @@ /* Is "array" only accessed as individual, fixed elements? * That is, does each access to "array" access a single, fixed element? */ -static isl_bool only_fixed_element_accessed(struct gpu_array_info *array) +isl_bool only_fixed_element_accessed(struct gpu_array_info *array) { int i; @@ -250,6 +250,9 @@ static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog, struct gpu_array_info *array, __isl_take isl_union_map *order) { + // We do not have independence information in Polly. Hence, make this + // function a no-op. 
+ return order; int i; for (i = 0; i < prog->scop->pet->n_independence; ++i) { Index: polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll =================================================================== --- polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll +++ polly/trunk/test/GPGPU/invariant-load-hoisting-with-failing-scop.ll @@ -1,84 +0,0 @@ -; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp1] -> { Stmt_if_end[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp1] -> { Stmt_for_body[i0] -> MemRef_control[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : tmp > 0 } -; SCOP-NEXT: } - -; Check that we generate a correct "always false" branch. -; HOST-IR: br i1 false, label %polly.start, label %entry.split.pre_entry_bb - -; This test case checks that we generate correct code if PPCGCodeGeneration -; decides a build is unsuccessful with invariant load hoisting enabled. -; -; There is a conditional branch which switches between the original code and -; the new code. We try to set this conditional branch to branch on false. -; However, invariant load hoisting changes the structure of the scop, so we -; need to change the way we *locate* this instruction. 
-; -; void f(const int *end, int *arr, const int *control, const int *readarr) { -; for (int i = 0; i < *end; i++) { -; int t = 0; -; if (*control > 3) { -; t += readarr[i]; -; } -; arr[i] = t; -; } -; } -; - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @f(i32* %end, i32* %arr, i32* %control, i32* %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp3 = load i32, i32* %end, align 4 - %cmp4 = icmp sgt i32 %tmp3, 0 - br i1 %cmp4, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %if.end - %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] - %tmp1 = load i32, i32* %control, align 4 - %cmp1 = icmp sgt i32 %tmp1, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, i32* %readarr, i32 %i.05 - %tmp2 = load i32, i32* %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, i32* %arr, i32 %i.05 - store i32 %t.0, i32* %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.05, 1 - %tmp = load i32, i32* %end, align 4 - %cmp = icmp slt i32 %inc, %tmp - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %if.end - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - Index: polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll =================================================================== --- polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll +++ polly/trunk/test/GPGPU/live-range-reordering-with-privatization.ll @@ -0,0 +1,78 @@ + ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: 
-polly-acc-dump-code -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-kernel-ir -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=KERNELIR + +; REQUIRES: pollyacc + +; void f(const int *end, int *arr, const int *control, const int *readarr) { +; for (int i = 0; i < *end; i++) { +; int t = 0; +; if (*control > 3) { +; t += readarr[i]; +; } +; arr[i] = t; +; } +; } + +; This test case tests the ability to infer that `t` is local to each loop +; iteration, and can therefore be privatized. + +; CODE: # kernel0 +; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) +; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { +; CODE-NEXT: Stmt_for_body(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: if (tmp1 >= 4) +; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: } + +; KERNELIR: %private_array = alloca i32 + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" + +define void @f(i32* %end, i32* %arr, i32* %control, i32* %readarr) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %tmp3 = load i32, i32* %end, align 4 + %cmp4 = icmp sgt i32 %tmp3, 0 + br i1 %cmp4, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %if.end + %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] + %tmp1 = load i32, i32* %control, align 4 + %cmp1 = icmp sgt i32 %tmp1, 3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32* %readarr, i32 %i.05 + %tmp2 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, 
%for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %arr, i32 %i.05 + store i32 %t.0, i32* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.05, 1 + %tmp = load i32, i32* %end, align 4 + %cmp = icmp slt i32 %inc, %tmp + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %if.end + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + Index: polly/trunk/test/GPGPU/non-read-only-scalars.ll =================================================================== --- polly/trunk/test/GPGPU/non-read-only-scalars.ll +++ polly/trunk/test/GPGPU/non-read-only-scalars.ll @@ -68,11 +68,16 @@ ; CODE-NEXT: Stmt_bb17(); ; CODE: # kernel2 -; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { -; CODE-NEXT: Stmt_bb18(c0); -; CODE-NEXT: if (c0 <= 31) -; CODE-NEXT: Stmt_bb20(c0); -; CODE-NEXT: } +; CODE-NEXT: { +; CODE-NEXT: read(); +; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { +; CODE-NEXT: Stmt_bb18(c0); +; CODE-NEXT: if (c0 <= 31) +; CODE-NEXT: Stmt_bb20(c0); +; CODE-NEXT: } +; CODE-NEXT: write(); +; CODE-NEXT: } + ; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_sum_0__phi) ; KERNEL-IR: store float 0.000000e+00, float* %sum.0.phiops Index: polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll =================================================================== --- polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll +++ polly/trunk/test/GPGPU/scalar-writes-in-scop-requires-abort.ll @@ -0,0 +1,66 @@ +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-scops \ +; RUN: -polly-acc-dump-code -analyze \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-code \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ 
+; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_loop[i0] -> MemRef_p[0] }; +; SCOP-NEXT: Execution Context: { : } +; SCOP-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: if (32 * b0 + t0 <= 1025) { +; CODE-NEXT: Stmt_loop(32 * b0 + t0); +; CODE-NEXT: write(0); +; CODE-NEXT: } +; CODE-NEXT: sync0(); +; CODE-NEXT: } + +; Check that we generate a correct "always false" branch. +; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb + +; This test case checks that we generate correct code if PPCGCodeGeneration +; decides a build is unsuccessful with invariant load hoisting enabled. +; +; There is a conditional branch which switches between the original code and +; the new code. We try to set this conditional branch to branch on false. +; However, invariant load hoisting changes the structure of the scop, so we +; need to change the way we *locate* this instruction. + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" + +define void @foo(float* %A, float* %p) { +entry: + br label %loop + +loop: + %indvar = phi i64 [0, %entry], [%indvar.next, %loop] + %indvar.next = add i64 %indvar, 1 + %invariant = load float, float* %p + %ptr = getelementptr float, float* %A, i64 %indvar + store float 42.0, float* %ptr + %cmp = icmp sle i64 %indvar, 1024 + br i1 %cmp, label %loop, label %loop2 + +loop2: + %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2] + %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2] + %indvar2.next = add i64 %indvar2, 1 + store float %indvar2f, float* %A + %cmp2 = icmp sle i64 %indvar2, 1024 + br i1 %cmp2, label %loop2, label %end + +end: + ret void +}