Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp +++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp @@ -2632,6 +2632,77 @@ return Names; } + /// Remove unreferenced parameter dimensions from union_map. + isl::union_map removeUnusedParameters(isl::union_map UMap) { + auto New = isl::union_map::empty(isl::space(UMap.get_ctx(), 0, 0)); + + auto RemoveUnusedDims = [&New](isl::map S) -> isl::stat { + int Removed = 0; + int NumDims = S.dim(isl::dim::param); + for (long i = 0; i < NumDims; i++) { + const int Dim = i - Removed; + if (!S.involves_dims(isl::dim::param, Dim, 1)) { + S = S.remove_dims(isl::dim::param, Dim, 1); + Removed++; + } + } + New = New.unite(S); + return isl::stat::ok; + }; + + UMap.foreach_map(RemoveUnusedDims); + return New; + } + + /// Remove unreferenced parameter dimensions from union_set. + isl::union_set removeUnusedParameters(isl::union_set USet) { + auto New = isl::union_set::empty(isl::space(USet.get_ctx(), 0, 0)); + + auto RemoveUnusedDims = [&New](isl::set S) -> isl::stat { + int Removed = 0; + int NumDims = S.dim(isl::dim::param); + for (long i = 0; i < NumDims; i++) { + const int Dim = i - Removed; + if (!S.involves_dims(isl::dim::param, Dim, 1)) { + S = S.remove_dims(isl::dim::param, Dim, 1); + Removed++; + } + } + New = New.unite(S); + return isl::stat::ok; + }; + + USet.foreach_set(RemoveUnusedDims); + return New; + } + + /// Simplify PPCG scop to improve compile time. + /// + /// We drop unused parameter dimensions to reduce the size of the sets we are + /// working with. Especially the computed dependences tend to accumulate a lot + /// of parameters that are present in the input memory accesses, but often are + /// not necessary to express the actual dependences. As isl represents maps + /// and sets with dense matrices, reducing the dimensionality of isl sets + /// commonly improves code generation performance. 
+ void simplifyPPCGScop(ppcg_scop *PPCGScop) { + PPCGScop->domain = + removeUnusedParameters(isl::manage(PPCGScop->domain)).release(); + + PPCGScop->dep_forced = + removeUnusedParameters(isl::manage(PPCGScop->dep_forced)).release(); + PPCGScop->dep_false = + removeUnusedParameters(isl::manage(PPCGScop->dep_false)).release(); + PPCGScop->dep_flow = + removeUnusedParameters(isl::manage(PPCGScop->dep_flow)).release(); + PPCGScop->tagged_dep_flow = + removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_flow)) + .release(); + + PPCGScop->tagged_dep_order = + removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_order)) + .release(); + } + /// Create a new PPCG scop from the current scop. /// /// The PPCG scop is initialized with data from the current polly::Scop. From @@ -2689,6 +2760,7 @@ compute_tagger(PPCGScop); compute_dependences(PPCGScop); eliminate_dead_code(PPCGScop); + simplifyPPCGScop(PPCGScop); return PPCGScop; } @@ -3130,10 +3202,14 @@ isl_schedule *Schedule = get_schedule(PPCGGen); - int has_permutable = has_any_permutable_node(Schedule); + /// Copy to and from device functions may introduce new parameters, which + /// must be present in the schedule tree root for code generation. Hence, + /// we ensure that all possible parameters are introduced from this point. 
+ if (!PollyManagedMemory) + Schedule = + isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); - Schedule = - isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); + int has_permutable = has_any_permutable_node(Schedule); if (!has_permutable || has_permutable < 0) { Schedule = isl_schedule_free(Schedule); Index: polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll =================================================================== --- polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll +++ polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll @@ -21,7 +21,7 @@ ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) ; KERNEL-NEXT: entry: ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 @@ -36,7 +36,7 @@ ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) +; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B) ; KERNEL-NEXT: entry: ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 @@ -47,13 +47,13 @@ ; KERNEL-NEXT: } -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* 
%p_dev_array_MemRef_A) ; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) ; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* Index: polly/trunk/test/GPGPU/non-zero-array-offset.ll =================================================================== --- polly/trunk/test/GPGPU/non-zero-array-offset.ll +++ polly/trunk/test/GPGPU/non-zero-array-offset.ll @@ -12,14 +12,14 @@ ; CODE: dim3 k0_dimBlock(8); ; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_B); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } ; CODE: { ; CODE-NEXT: dim3 k1_dimBlock(8); ; CODE-NEXT: dim3 k1_dimGrid(1); -; CODE-NEXT: kernel1 <<>> (dev_MemRef_B); +; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } @@ -27,10 +27,10 @@ ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); ; CODE: # kernel0 -; CODE-NEXT: Stmt_bb11(t0); +; CODE-NEXT: Stmt_bb3(t0); ; CODE: # kernel1 -; CODE-NEXT: Stmt_bb3(t0); +; CODE-NEXT: Stmt_bb11(t0); ; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32) ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)