Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1286,7 +1286,6 @@ isl_ast_expr_free(Expr); isl_ast_node_free(UserStmt); - return; } void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) { @@ -1477,6 +1476,27 @@ LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(), &ParamSpace}; + /// Check if a ScopArrayInfo is modeled by the `ppcg_kernel`. + /// + /// @param Needle The ScopArrayInfo to check whether the kernel models. + /// @param Kernel The PPCG representation of the current kernel. + /// @param Prog The PPCG god object that knows about all arrays in the + /// scop. + /// @returns whether the ScopArrayInfo is referred to by the ppcg_kernel. + /// + /// Function is a local lambda because there is only one call site, which + /// is in this function. + auto isSAIModeledByKernel = [](const ScopArrayInfo *Needle, + ppcg_kernel *Kernel, gpu_prog *Prog) -> bool { + for (int i = 0; i < Prog->n_array; i++) { + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); + if (SAI == Needle) + return ppcg_kernel_requires_array_argument(Kernel, i); + } + return false; + }; + for (const auto &I : IDToValue) SubtreeValues.insert(I.second); @@ -1497,8 +1517,15 @@ return S.contains(L) || L->contains(S.getEntry()); }); + // Only remove those base pointers from SubtreeValues which are actually + // used by the kernel. That way, we don't double send base pointers. + // We do need to send base pointers of arrays that are _not directly used_ + // in the kernel, because they can be involved in code that we don't model, + // such as ptrtoint. 
+ // See: test/GPGPU/only-remove-arrays-from-subtree-that-are-used-by-kernel.ll for (auto &SAI : S.arrays()) - SubtreeValues.remove(SAI->getBasePtr()); + if (isSAIModeledByKernel(SAI, Kernel, Prog)) + SubtreeValues.remove(SAI->getBasePtr()); isl_space *Space = S.getParamSpace().release(); for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) { Index: test/GPGPU/invariant-load-hoisting-read-in-kernel.ll =================================================================== --- test/GPGPU/invariant-load-hoisting-read-in-kernel.ll +++ test/GPGPU/invariant-load-hoisting-read-in-kernel.ll @@ -7,7 +7,7 @@ ; Verify that invariant loads used in a kernel statement are correctly forwarded ; as subtree value to the GPU kernel. -; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0({{.*}} float %polly.access.p.load) +; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0({{.*}} float %polly.access.p.load{{.*}}) ; CHECK: store float %polly.access.p.load, float* %indvar2f.phiops define void @foo(float* %A, float* %p) { Index: test/GPGPU/invariant-load-hoisting.ll =================================================================== --- test/GPGPU/invariant-load-hoisting.ll +++ test/GPGPU/invariant-load-hoisting.ll @@ -17,10 +17,10 @@ ; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; ; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } ; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; HOST-IR-NEXT: call void @polly_freeKernel(i8* %209) +; HOST-IR: call void @polly_launchKernel(i8* %211, i32 %217, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; HOST-IR-NEXT: call void @polly_freeKernel(i8* %211) -; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) +; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* 
%MemRef_A, i32 %n, i32 %tmp12, i32* %invariant, i32 %polly.preload.tmp21.merge) ; Check that we generate correct GPU code in case of invariant load hoisting. Index: test/GPGPU/only-remove-arrays-from-subtree-that-are-used-by-kernel.ll =================================================================== --- /dev/null +++ test/GPGPU/only-remove-arrays-from-subtree-that-are-used-by-kernel.ll @@ -0,0 +1,145 @@ +; RUN: opt %loadPolly -polly-scops -analyze \ +; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP +; +; RUN: opt %loadPolly -polly-scops -analyze -polly-invariant-load-hoisting \ +; RUN: -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR +; +; REQUIRES: pollyacc +; +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end10 +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP-NEXT: Invariant Accesses: { +; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_for_body[i0] -> MemRef_A_addr[0] }; +; SCOP-NEXT: Execution Context: { : } +; SCOP-NEXT: } +; SCOP: Arrays { +; SCOP-NEXT: i32* MemRef_A_addr[*]; // Element size 8 +; SCOP-NEXT: i32 MemRef_tmp[*]; [BasePtrOrigin: MemRef_A_addr] // Element size 4 +; SCOP-NEXT: i32 MemRef_B[*]; // Element size 4 +; SCOP-NEXT: } + +; Check that we model the access of A and B but *not* the access of &A in +; Stmt_for_body5 + +; SCOP: Statements { +; SCOP-NEXT: Stmt_for_body +; SCOP-NEXT: Domain := +; SCOP-NEXT: { Stmt_for_body[i0] : 0 <= i0 <= 999 }; +; SCOP-NEXT: Schedule := +; SCOP-NEXT: { Stmt_for_body[i0] -> [0, i0] }; +; SCOP-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_for_body[i0] -> MemRef_tmp[i0] }; +; SCOP-NEXT: Stmt_for_body5 +; SCOP-NEXT: Domain := +; SCOP-NEXT: { Stmt_for_body5[i0] : 0 <= i0 <= 19999 }; +; SCOP-NEXT: Schedule := +; SCOP-NEXT: { Stmt_for_body5[i0] -> [1, i0] }; +; SCOP-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; SCOP-NEXT: { Stmt_for_body5[i0] -> MemRef_B[i0] }; 
+; SCOP-NEXT: } + + +; KERNEL-IR: polly.stmt.for.body5: ; preds = %entry +; KERNEL-IR-NEXT: %p_tmp4 = ptrtoint i32** %A.addr to i64 +; KERNEL-IR-NEXT: %p_tmp5 = trunc i64 %p_tmp4 to i32 +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_B = bitcast i8 addrspace(1)* %MemRef_B to i32 addrspace(1)* +; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 +; KERNEL-IR-NEXT: %polly.access.MemRef_B = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_B, i64 %5 +; KERNEL-IR-NEXT: store i32 %p_tmp5, i32 addrspace(1)* %polly.access.MemRef_B, align 4 + + +; Check that we launch a kernel +; HOST-IR: call void @polly_launchKernel +; HOST-IR-NEXT: call void @polly_freeKernel + +; void f(int *A, int *B) { +; Kernel #0 +; for(int i = 0; i < 1000; i++) { +; A[i] = 100; +; } +; +; Kernel #1 +; for(int i = 0; i < 20000; i++) { +; B[i] = &A; +; } +; } + +; Check that we still send A to kernel #1 even though we don't load/store +; from it. +; This is a regression test. + +; Polly constructs a set of values called 'SubtreeValues' for each kernel. +; This refers to the set of values that are potentially used by a kernel, +; and are hence sent over to the kernel. +; This would also include the base pointers of all arrays. +; However, we send arrays separately. So, we remove array base pointers +; from SubtreeValues to prevent double-sending. +; Previously, we removed *all* arrays of the *Scop* from SubtreeValues. +; We _should_ only remove arrays *modeled (loaded/stored)* in the *Kernel* from SubtreeValues. +; Make sure that in this case, we _do_ actually keep `A` in SubtreeValues, +; because we do not model a load or a store to A, but we _do_ use A in Kernel #1 +; in the form of a ptrtoint. 
+ + + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +define void @f(i32* %A, i32* %B) { +entry: + %A.addr = alloca i32*, align 8 + br label %entry.split + +entry.split: ; preds = %entry + store i32* %A, i32** %A.addr, align 8, !tbaa !3 + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv13 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next2, %for.body ] + %tmp = load i32*, i32** %A.addr, align 8, !tbaa !3 + %arrayidx = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv13 + store i32 100, i32* %arrayidx, align 4, !tbaa !7 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond3 = icmp eq i64 %indvars.iv.next2, 1000 + br i1 %exitcond3, label %for.end, label %for.body + +for.end: ; preds = %for.body + br label %for.body5 + +for.body5: ; preds = %for.end, %for.body5 + %indvars.iv2 = phi i64 [ 0, %for.end ], [ %indvars.iv.next, %for.body5 ] + %tmp4 = ptrtoint i32** %A.addr to i64 + %tmp5 = trunc i64 %tmp4 to i32 + %arrayidx7 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv2 + store i32 %tmp5, i32* %arrayidx7, align 4, !tbaa !7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20000 + br i1 %exitcond, label %for.end10, label %for.body5 + +for.end10: ; preds = %for.body5 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 3d47fbe0d75b2d1bb5353cc73a9484502860b571) (http://llvm.org/git/llvm.git dbf8de9323906a469b4680f06a8f3e9842dc20f4)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"any pointer", !5, i64 0} +!5 = !{!"omnipotent char", 
!6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8, !8, i64 0} +!8 = !{!"int", !5, i64 0}