Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -1560,8 +1560,12 @@ RuntimeDebugBuilder::createCPUPrinter( Builder, "F: " + F->getName().str() + " R: " + S.getRegion().getNameStr() + - " __RTC: ", - RTC, " Overflow: ", OverflowHappened, "\n"); + " RTC: ", + RTC, " Overflow: ", OverflowHappened, + "\n" + " (0 failed, -1 succeeded)\n" + " (if one or both are 0 falling back to original code, if both are -1 " + "executing Polly code)\n"); } RTC = Builder.CreateAnd(RTC, OverflowHappened, "polly.rtc.result"); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -2801,6 +2801,8 @@ Access->ref_id = Acc->getId().release(); Access->next = Accesses; Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); + Access->fixed_element = + Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false; Accesses = Access; } @@ -3015,6 +3017,7 @@ i++; collect_references(PPCGProg, &PPCGArray); + PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray); } } @@ -3056,13 +3059,6 @@ PPCGProg->to_outer = getArrayIdentity(); // TODO: verify that this assignment is correct. PPCGProg->any_to_outer = nullptr; - - // this needs to be set when live range reordering is enabled. - // NOTE: I believe that is conservatively correct. I'm not sure - // what the semantics of this is. - // Quoting PPCG/gpu.h: "Order dependences on non-scalars." 
- PPCGProg->array_order = - isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGProg->n_stmts = std::distance(S->begin(), S->end()); PPCGProg->stmts = getStatements(); @@ -3085,6 +3081,9 @@ createArrays(PPCGProg, ValidSAIs); + PPCGProg->array_order = nullptr; + collect_order_dependences(PPCGProg); + PPCGProg->may_persist = compute_may_persist(PPCGProg); return PPCGProg; } Index: lib/External/ppcg/gpu.h =================================================================== --- lib/External/ppcg/gpu.h +++ lib/External/ppcg/gpu.h @@ -454,4 +454,6 @@ __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); +void collect_order_dependences(struct gpu_prog *prog); +isl_bool only_fixed_element_accessed(struct gpu_array_info *array); #endif Index: lib/External/ppcg/gpu.c =================================================================== --- lib/External/ppcg/gpu.c +++ lib/External/ppcg/gpu.c @@ -162,7 +162,7 @@ /* Is "array" only accessed as individual, fixed elements? * That is, does each access to "array" access a single, fixed element? */ -static isl_bool only_fixed_element_accessed(struct gpu_array_info *array) +isl_bool only_fixed_element_accessed(struct gpu_array_info *array) { int i; @@ -250,6 +250,9 @@ static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog, struct gpu_array_info *array, __isl_take isl_union_map *order) { + /* We do not have independence information in Polly. Hence, make this + * function a no-op. */ 
+ return order; int i; for (i = 0; i < prog->scop->pet->n_independence; ++i) { Index: test/GPGPU/live-range-reordering-with-privatization.ll =================================================================== --- test/GPGPU/live-range-reordering-with-privatization.ll +++ test/GPGPU/live-range-reordering-with-privatization.ll @@ -1,34 +1,15 @@ -; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP + ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-code -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=CODE -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR +; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ +; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-kernel-ir -disable-output \ +; RUN: < %s | FileCheck %s -check-prefix=KERNELIR ; REQUIRES: pollyacc -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp1] -> { Stmt_if_end[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp1] -> { Stmt_for_body[i0] -> MemRef_control[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : tmp > 0 } -; SCOP-NEXT: } - -; Check that we generate a correct "always false" branch. -; HOST-IR: br i1 false, label %polly.start, label %entry.split.pre_entry_bb - -; This test case checks that we generate correct code if PPCGCodeGeneration -; decides a build is unsuccessful with invariant load hoisting enabled. 
-; -; There is a conditional branch which switches between the original code and -; the new code. We try to set this conditional branch to branch on false. -; However, invariant load hoisting changes the structure of the scop, so we -; need to change the way we *locate* this instruction. -; ; void f(const int *end, int *arr, const int *control, const int *readarr) { ; for (int i = 0; i < *end; i++) { ; int t = 0; @@ -38,7 +19,17 @@ ; arr[i] = t; ; } ; } -; + +; CODE: # kernel0 +; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) +; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { +; CODE-NEXT: Stmt_for_body(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: if (tmp1 >= 4) +; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); +; CODE-NEXT: } + +; KERNELIR: %private_array = alloca i32 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" target triple = "i386-apple-macosx10.12.0" Index: test/GPGPU/non-read-only-scalars.ll =================================================================== --- test/GPGPU/non-read-only-scalars.ll +++ test/GPGPU/non-read-only-scalars.ll @@ -68,11 +68,16 @@ ; CODE-NEXT: Stmt_bb17(); ; CODE: # kernel2 -; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { -; CODE-NEXT: Stmt_bb18(c0); -; CODE-NEXT: if (c0 <= 31) -; CODE-NEXT: Stmt_bb20(c0); -; CODE-NEXT: } +; CODE-NEXT: { +; CODE-NEXT: read(); +; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { +; CODE-NEXT: Stmt_bb18(c0); +; CODE-NEXT: if (c0 <= 31) +; CODE-NEXT: Stmt_bb20(c0); +; CODE-NEXT: } +; CODE-NEXT: write(); +; CODE-NEXT: } + ; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_sum_0__phi) ; KERNEL-IR: store float 0.000000e+00, float* %sum.0.phiops Index: test/GPGPU/scalar-writes-in-scop-requires-abort.ll =================================================================== --- /dev/null +++ test/GPGPU/scalar-writes-in-scop-requires-abort.ll @@ -0,0 +1,49 
@@ +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-acc-dump-code \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE + +; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: if (32 * b0 + t0 <= 1025) { +; CODE-NEXT: Stmt_loop(32 * b0 + t0); +; CODE-NEXT: write(0); +; CODE-NEXT: } +; CODE-NEXT: sync0(); +; CODE-NEXT: } + +; Check that we generate a correct "always false" branch. +; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb + + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.12.0" + +define void @foo(float* %A, float* %p) { +entry: + br label %loop + +loop: + %indvar = phi i64 [0, %entry], [%indvar.next, %loop] + %indvar.next = add i64 %indvar, 1 + %invariant = load float, float* %p + %ptr = getelementptr float, float* %A, i64 %indvar + store float 42.0, float* %ptr + %cmp = icmp sle i64 %indvar, 1024 + br i1 %cmp, label %loop, label %loop2 + +loop2: + %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2] + %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2] + %indvar2.next = add i64 %indvar2, 1 + store float %indvar2f, float* %A + %cmp2 = icmp sle i64 %indvar2, 1024 + br i1 %cmp2, label %loop2, label %end + +end: + ret void +}