Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -1477,6 +1477,28 @@
       LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(),
       &ParamSpace};
 
+  /// Check if a ScopArrayInfo is modeled by the `ppcg_kernel`.
+  ///
+  /// @param Needle The ScopArrayInfo to check whether the kernel models.
+  /// @param Kernel The PPCG representation of the current kernel.
+  /// @param Prog   The PPCG program object that knows about all arrays in
+  ///               the scop.
+  /// @returns Whether the ScopArrayInfo is referred to by the ppcg_kernel.
+  ///
+  /// This is a local lambda because it has only a single call site, which
+  /// is in this function.
+  auto isSAIModeledByKernel = [](const ScopArrayInfo *Needle,
+                                 ppcg_kernel *Kernel, gpu_prog *Prog) -> bool {
+    for (int i = 0; i < Prog->n_array; i++) {
+      isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
+      const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
+      if (SAI == Needle)
+        return ppcg_kernel_requires_array_argument(Kernel, i);
+    }
+    // Not known to PPCG: conservatively keep it in SubtreeValues.
+    return false;
+  };
+
   for (const auto &I : IDToValue)
     SubtreeValues.insert(I.second);
 
@@ -1497,8 +1519,16 @@
     return S.contains(L) || L->contains(S.getEntry());
   });
 
-  for (auto &SAI : S.arrays())
-    SubtreeValues.remove(SAI->getBasePtr());
+  // Only remove base pointers from SubtreeValues that the kernel actually
+  // uses, so that used base pointers are not passed to the kernel twice.
+  // Base pointers of arrays that are _not directly used_ in the kernel must
+  // still be passed, because they can be involved in code that we do not
+  // model, such as ptrtoint.
+  // See: * test/GPGPU/
+  for (auto &SAI : S.arrays()) {
+    if (isSAIModeledByKernel(SAI, Kernel, Prog))
+      SubtreeValues.remove(SAI->getBasePtr());
+  }
 
   isl_space *Space = S.getParamSpace().release();
   for (long i = 0, n = isl_space_dim(Space, isl_dim_param); i < n; i++) {
Index: test/GPGPU/invariant-load-hoisting-read-in-kernel.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting-read-in-kernel.ll
+++ test/GPGPU/invariant-load-hoisting-read-in-kernel.ll
@@ -7,7 +7,7 @@
 ; Verify that invariant loads used in a kernel statement are correctly forwarded
 ; as subtree value to the GPU kernel.
 
-; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0({{.*}} float %polly.access.p.load)
+; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0({{.*}} float %polly.access.p.load{{.*}})
 ; CHECK: store float %polly.access.p.load, float* %indvar2f.phiops
 
 define void @foo(float* %A, float* %p) {
Index: test/GPGPU/invariant-load-hoisting.ll
===================================================================
--- test/GPGPU/invariant-load-hoisting.ll
+++ test/GPGPU/invariant-load-hoisting.ll
@@ -17,10 +17,10 @@
 ; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] };
 ; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 }
 ; SCOP-NEXT: }
 
-; HOST-IR: call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
-; HOST-IR-NEXT: call void @polly_freeKernel(i8* %209)
+; HOST-IR: call void @polly_launchKernel(i8* %211, i32 %217, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; HOST-IR-NEXT: call void @polly_freeKernel(i8* %211)
 
-; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge)
+; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32* %invariant, i32 %polly.preload.tmp21.merge)
 
 ; Check that we generate correct GPU code in case of invariant load hoisting.
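
For illustration, the ptrtoint situation referenced by the new comment in the
first hunk can look like the following minimal LLVM IR sketch. The example is
hypothetical (the function and value names are made up, and it is not part of
this patch's test suite): the base pointer %A never appears in a modeled
memory access, so PPCG does not treat it as a kernel array argument, yet the
kernel still needs %A because a statement converts it to an integer.

define void @ptrtoint_user(float* %A, i64* %Out) {
entry:
  ; %A is never loaded from or stored to, so it is not modeled as an array
  ; of the kernel; its only use is the unmodeled ptrtoint below. Its base
  ; pointer therefore has to reach the kernel as a subtree value.
  %A.int = ptrtoint float* %A to i64
  store i64 %A.int, i64* %Out
  ret void
}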