Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -137,7 +137,11 @@
   /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
   isl::union_map TaggedMustKills;

-  MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
+  /// Tagged must kills stripped of the tags.
+  /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
+  isl::union_map MustKills;
+
+  MustKillsInfo() : KillsSchedule(nullptr) {}
 };

 /// Check if SAI's uses are entirely contained within Scop S.
@@ -179,6 +183,7 @@
   }

   Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
+  Info.MustKills = isl::union_map::empty(isl::space(ParamSpace));

   // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
   // schedule:
@@ -225,6 +230,9 @@
     isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
     Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);

+    // 2. [param] -> { Stmt[] -> scalar_to_kill[] }
+    Info.MustKills = Info.TaggedMustKills.domain_factor_domain();
+
     // 3. Create the kill schedule of the form:
     //     "[param] -> { Stmt_phantom[] }"
     // Then add this to Info.KillsSchedule.
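For readers unfamiliar with isl's tagged relations, the following standalone sketch (not part of the patch; the statement and array names are made up) shows what domain_factor_domain() does to a tagged must-kill map: it drops the ref_phantom[] tag and leaves the plain { Stmt_phantom[] -> scalar_to_kill[] } relation stored in the new MustKills member.

#include <isl/ctx.h>
#include <isl/union_map.h>

int main() {
  isl_ctx *Ctx = isl_ctx_alloc();
  // Tagged form: [Stmt -> ref] -> scalar, as stored in TaggedMustKills.
  isl_union_map *Tagged = isl_union_map_read_from_str(
      Ctx, "{ [Stmt_phantom[] -> ref_phantom[]] -> MemRef_tmp[] }");
  // Untagged form: Stmt -> scalar, as stored in the new MustKills member.
  isl_union_map *Kills = isl_union_map_domain_factor_domain(Tagged);
  isl_union_map_dump(Kills); // { Stmt_phantom[] -> MemRef_tmp[] }
  isl_union_map_free(Kills);
  isl_ctx_free(Ctx);
  return 0;
}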
@@ -1004,11 +1012,11 @@
   Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

   if (!gpu_array_is_scalar(Array)) {
-    auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
+    auto OffsetDimZero = isl_multi_pw_aff_get_pw_aff(Array->bound, 0);
     isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

     for (unsigned int i = 1; i < Array->n_index; i++) {
-      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
+      isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i);
       isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
       Res = isl_ast_expr_mul(Res, Expr);
     }
@@ -1048,7 +1056,7 @@
   for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
     if (i > 0) {
-      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
+      isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1);
       isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
       Result = isl_ast_expr_mul(Result, BExpr);
     }
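All of the bound-related hunks follow from one data-structure change in the updated ppcg: gpu_array_info::bound is now a single isl_multi_pw_aff instead of a malloc'ed C array of isl_pw_aff pointers, so per-dimension sizes are fetched with isl_multi_pw_aff_get_pw_aff, which already returns a fresh copy and replaces the explicit isl_pw_aff_copy. A standalone sketch (hypothetical array float A[n][64]):

#include <isl/ctx.h>
#include <isl/aff.h>

int main() {
  isl_ctx *Ctx = isl_ctx_alloc();
  // Bounds of a hypothetical array float A[n][64], one pw_aff per dimension.
  isl_multi_pw_aff *Bound =
      isl_multi_pw_aff_read_from_str(Ctx, "[n] -> { [(n), (64)] }");
  int N = isl_multi_pw_aff_dim(Bound, isl_dim_out);
  for (int i = 0; i < N; i++) {
    isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Bound, i); // fresh copy
    isl_pw_aff_dump(Size);
    isl_pw_aff_free(Size);
  }
  isl_multi_pw_aff_free(Bound);
  isl_ctx_free(Ctx);
  return 0;
}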
@@ -1152,7 +1160,18 @@
     isl_ast_expr_free(Expr);
     return;
   }
-
+  if (!strcmp(Str, "init_device")) {
+    initializeAfterRTH();
+    isl_ast_node_free(UserStmt);
+    isl_ast_expr_free(Expr);
+    return;
+  }
+  if (!strcmp(Str, "clear_device")) {
+    finalize();
+    isl_ast_node_free(UserStmt);
+    isl_ast_expr_free(Expr);
+    return;
+  }
   if (isPrefix(Str, "to_device")) {
     if (!ManagedMemory)
       createDataTransfer(UserStmt, HOST_TO_DEVICE);
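The updated ppcg now emits explicit init_device and clear_device statements into the host AST, and Polly maps them onto its existing initializeAfterRTH() and finalize() hooks. In terms of the CUDA runtime calls that the tests below check for, the two nodes bracket the generated host code roughly as follows (a sketch, not generated code; the buffer name is hypothetical):

#include <cuda_runtime.h>

void device_lifetime(float *A, size_t Bytes) {
  float *dev_A;
  cudaMalloc((void **)&dev_A, Bytes);                  // "init_device"
  cudaMemcpy(dev_A, A, Bytes, cudaMemcpyHostToDevice); // "to_device"
  // ... kernel launches ...
  cudaMemcpy(A, dev_A, Bytes, cudaMemcpyDeviceToHost); // "from_device"
  cudaFree(dev_A);                                     // "clear_device"
}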
@@ -1766,7 +1785,7 @@
       Sizes.push_back(nullptr);
       for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
         isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
-            Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
+            Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
         auto V = ExprBuilder.create(DimSize);
         Sizes.push_back(SE.getSCEV(V));
       }
@@ -2127,6 +2146,7 @@
   Options->debug = DebugOptions;

+  Options->group_chains = false;
   Options->reschedule = true;
   Options->scale_tile_loops = false;
   Options->wrap = false;
@@ -2135,8 +2155,11 @@
   Options->ctx = nullptr;
   Options->sizes = nullptr;

+  Options->tile = true;
   Options->tile_size = 32;

+  Options->isolate_full_tiles = false;
+
   Options->use_private_memory = PrivateMemory;
   Options->use_shared_memory = SharedMemory;
   Options->max_shared_memory = 48 * 1024;
@@ -2144,8 +2167,13 @@
   Options->target = PPCG_TARGET_CUDA;
   Options->openmp = false;
   Options->linearize_device_arrays = true;
-  Options->live_range_reordering = false;
+  Options->allow_gnu_extensions = false;
+
+  Options->unroll_copy_shared = false;
+  Options->unroll_gpu_tile = false;
+  Options->live_range_reordering = true;
+
+  Options->hybrid = false;
   Options->opencl_compiler_options = nullptr;
   Options->opencl_use_gpu = false;
   Options->opencl_n_include_file = 0;
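Note the flip of live_range_reordering to true. The classical motivation (a sketch, not taken from the patch): a scalar temporary whose live range is confined to a single iteration would otherwise induce false loop-carried dependences that serialize the loop.

// Without live-range analysis, every iteration appears to read and write the
// same scalar `tmp`, creating loop-carried (false) dependences. With
// live-range reordering enabled, each iteration's live range of `tmp` is
// treated as independent, so the loop remains parallelizable.
void scale(const float *A, float *B, int N) {
  for (int i = 0; i < N; i++) {
    float tmp = A[i] * 2.0f; // live range: this iteration only
    B[i] = tmp + 1.0f;
  }
}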
@@ -2260,6 +2289,8 @@
 ///
 /// @returns A new ppcg scop.
 ppcg_scop *createPPCGScop() {
+  MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
+
   auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

   PPCGScop->options = createPPCGOptions();
@@ -2271,7 +2302,8 @@
   PPCGScop->context = S->getContext();
   PPCGScop->domain = S->getDomains();
-  PPCGScop->call = nullptr;
+  // TODO: investigate this further. PPCG calls collect_call_domains.
+  PPCGScop->call = isl_union_set_from_set(S->getContext());
   PPCGScop->tagged_reads = getTaggedReads();
   PPCGScop->reads = S->getReads();
   PPCGScop->live_in = nullptr;
@@ -2280,6 +2312,9 @@
   PPCGScop->tagged_must_writes = getTaggedMustWrites();
   PPCGScop->must_writes = S->getMustWrites();
   PPCGScop->live_out = nullptr;
+  PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
+  PPCGScop->must_kills = KillsInfo.MustKills.take();
+
   PPCGScop->tagger = nullptr;
   PPCGScop->independence =
       isl_union_map_empty(isl_set_get_space(PPCGScop->context));
@@ -2291,19 +2326,17 @@
   PPCGScop->tagged_dep_order = nullptr;

   PPCGScop->schedule = S->getScheduleTree();
-
-  MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
   // If we have something non-trivial to kill, add it to the schedule
   if (KillsInfo.KillsSchedule.get())
     PPCGScop->schedule = isl_schedule_sequence(
         PPCGScop->schedule, KillsInfo.KillsSchedule.take());
-  PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();

   PPCGScop->names = getNames();
   PPCGScop->pet = nullptr;

   compute_tagger(PPCGScop);
   compute_dependences(PPCGScop);
+  eliminate_dead_code(PPCGScop);

   return PPCGScop;
 }
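The kill schedule is appended as a sequence node, so the phantom kill statements run after everything else in the SCoP; only then can eliminate_dead_code() treat the killed scalars as dead. A standalone sketch (hypothetical statement domains) of the isl calls involved:

#include <isl/ctx.h>
#include <isl/schedule.h>
#include <isl/union_set.h>

int main() {
  isl_ctx *Ctx = isl_ctx_alloc();
  isl_schedule *Orig = isl_schedule_from_domain(
      isl_union_set_read_from_str(Ctx, "{ Stmt_bb4[i] : 0 <= i < 128 }"));
  isl_schedule *Kills = isl_schedule_from_domain(
      isl_union_set_read_from_str(Ctx, "{ Stmt_phantom[] }"));
  // Run the original statements first, then the kills.
  isl_schedule *Seq = isl_schedule_sequence(Orig, Kills);
  isl_schedule_dump(Seq);
  isl_schedule_free(Seq);
  isl_ctx_free(Ctx);
  return 0;
}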
@@ -2458,14 +2491,23 @@
 ///
 /// @param PPCGArray The array to compute bounds for.
 /// @param Array The polly array from which to take the information.
 void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
+  isl_pw_aff_list *BoundsList =
+      isl_pw_aff_list_alloc(S->getIslCtx(), PPCGArray.n_index);
+  std::vector<isl::pw_aff> PwAffs;
+
+  isl_space *AlignSpace = S->getParamSpace();
+  AlignSpace = isl_space_add_dims(AlignSpace, isl_dim_set, 1);
+
   if (PPCGArray.n_index > 0) {
     if (isl_set_is_empty(PPCGArray.extent)) {
       isl_set *Dom = isl_set_copy(PPCGArray.extent);
       isl_local_space *LS = isl_local_space_from_space(
           isl_space_params(isl_set_get_space(Dom)));
       isl_set_free(Dom);
-      isl_aff *Zero = isl_aff_zero_on_domain(LS);
-      PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
+      isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
+      Zero = isl_pw_aff_align_params(Zero, isl_space_copy(AlignSpace));
+      PwAffs.push_back(isl::manage(isl_pw_aff_copy(Zero)));
+      BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Zero);
     } else {
       isl_set *Dom = isl_set_copy(PPCGArray.extent);
       Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
@@ -2478,7 +2520,9 @@
       One = isl_aff_add_constant_si(One, 1);
       Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
       Bound = isl_pw_aff_gist(Bound, S->getContext());
-      PPCGArray.bound[0] = Bound;
+      Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
+      PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
+      BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Bound);
     }
   }

@@ -2487,8 +2531,20 @@
     auto LS = isl_pw_aff_get_domain_space(Bound);
     auto Aff = isl_multi_aff_zero(LS);
     Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
-    PPCGArray.bound[i] = Bound;
+    Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
+    PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
+    BoundsList = isl_pw_aff_list_insert(BoundsList, i, Bound);
   }
+
+  isl_space_free(AlignSpace);
+  isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);
+
+  assert(BoundsSpace && "Unable to access space of array.");
+  assert(BoundsList && "Unable to access list of bounds.");
+
+  PPCGArray.bound =
+      isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
+  assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
 }

 /// Create the arrays for @p PPCGProg.
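setArrayBounds() now collects the per-dimension bounds in an isl_pw_aff_list and folds the list into the single isl_multi_pw_aff that the updated gpu_array_info expects, with the space of the extent providing the output tuple. Reduced to its isl core (hypothetical bounds for a 2D array; a sketch, not the patch's code):

#include <isl/ctx.h>
#include <isl/aff.h>
#include <isl/space.h>

int main() {
  isl_ctx *Ctx = isl_ctx_alloc();
  isl_pw_aff_list *List = isl_pw_aff_list_alloc(Ctx, 2);
  List = isl_pw_aff_list_add(
      List, isl_pw_aff_read_from_str(Ctx, "[n] -> { [(n)] }"));
  List = isl_pw_aff_list_add(
      List, isl_pw_aff_read_from_str(Ctx, "[n] -> { [(64)] }"));
  // A 2D set space, standing in for the space of PPCGArray.extent.
  isl_space *Space = isl_space_set_alloc(Ctx, /*nparam=*/1, /*dim=*/2);
  Space = isl_space_set_dim_name(Space, isl_dim_param, 0, "n");
  isl_multi_pw_aff *Bound = isl_multi_pw_aff_from_pw_aff_list(Space, List);
  isl_multi_pw_aff_dump(Bound);
  isl_multi_pw_aff_free(Bound);
  isl_ctx_free(Ctx);
  return 0;
}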
@@ -2511,8 +2567,6 @@
     PPCGArray.name = strdup(Array->getName().c_str());
     PPCGArray.extent = nullptr;
     PPCGArray.n_index = Array->getNumberOfDimensions();
-    PPCGArray.bound =
-        isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
     PPCGArray.extent = getExtent(Array);
     PPCGArray.n_ref = 0;
     PPCGArray.refs = nullptr;
@@ -2527,6 +2581,7 @@
     PPCGArray.dep_order = nullptr;
     PPCGArray.user = Array;

+    PPCGArray.bound = nullptr;
     setArrayBounds(PPCGArray, Array);

     i++;
@@ -2570,6 +2625,7 @@
       isl_union_map_copy(PPCGScop->tagged_must_kills);
   PPCGProg->to_inner = getArrayIdentity();
   PPCGProg->to_outer = getArrayIdentity();
+  // TODO: verify that this assignment is correct.
   PPCGProg->any_to_outer = nullptr;

   // this needs to be set when live range reordering is enabled.
@@ -2962,15 +3018,16 @@
     Condition = isl_ast_expr_and(Condition, SufficientCompute);
     isl_ast_build_free(Build);

+    // Preload invariant loads. Note: This should happen before the RTC
+    // because the RTC may depend on values that are invariant load hoisted.
+    NodeBuilder.preloadInvariantLoads();
+
     Value *RTC = NodeBuilder.createRTC(Condition);
     Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

     Builder.SetInsertPoint(&*StartBlock->begin());

-    NodeBuilder.initializeAfterRTH();
-    NodeBuilder.preloadInvariantLoads();
     NodeBuilder.create(Root);
-    NodeBuilder.finalize();

     /// In case a sequential kernel has more surrounding loops as any parallel
     /// kernel, the SCoP is probably mostly sequential. Hence, there is no
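With the device setup and teardown moved into the generated AST, every GPGPU test below loses its "# host" header block and gains cudaFree() teardown checks. For orientation, the host code whose shape the CODE lines verify looks roughly like this when written out as plain CUDA (a sketch; names and sizes echo the mostly-sequential.ll test, and Polly's cudaCheckReturn/cudaCheckKernel error-check wrappers are omitted):

#include <cuda_runtime.h>

__global__ void kernel0(float *A); // stands in for the generated PTX kernel

void host_side(float *MemRef_A) {
  float *dev_MemRef_A;
  cudaMalloc((void **)&dev_MemRef_A, 128 * sizeof(float));
  cudaMemcpy(dev_MemRef_A, MemRef_A, 128 * sizeof(float),
             cudaMemcpyHostToDevice);
  {
    dim3 k0_dimBlock(32);
    dim3 k0_dimGrid(4);
    kernel0<<<k0_dimGrid, k0_dimBlock>>>(dev_MemRef_A);
  }
  cudaMemcpy(MemRef_A, dev_MemRef_A, 128 * sizeof(float),
             cudaMemcpyDeviceToHost);
  cudaFree(dev_MemRef_A);
}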
Index: test/GPGPU/host-control-flow.ll
===================================================================
--- test/GPGPU/host-control-flow.ll
+++ test/GPGPU/host-control-flow.ll
@@ -14,9 +14,7 @@

 ; REQUIRES: pollyacc

-; CODE: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
@@ -26,6 +24,7 @@
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
 ; CODE-NEXT: }

 ; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader

Index: test/GPGPU/host-statement.ll
===================================================================
--- test/GPGPU/host-statement.ll
+++ test/GPGPU/host-statement.ll
@@ -18,11 +18,7 @@
 ; This test case tests that we can correctly handle a ScopStmt that is
 ; scheduled on the host, instead of within a kernel.

-; CODE-LABEL: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {

Index: test/GPGPU/invalid-kernel.ll
===================================================================
--- test/GPGPU/invalid-kernel.ll
+++ test/GPGPU/invalid-kernel.ll
@@ -20,11 +20,7 @@
 ; were we still lack proper code-generation support. We check here that we
 ; detect the invalid IR and bail out gracefully.

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
@@ -34,7 +30,6 @@
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }

 ; CODE: # kernel0
 ; CODE-NEXT: Stmt_bb2(32 * b0 + t0);

Index: test/GPGPU/kernel-params-only-some-arrays.ll
===================================================================
--- test/GPGPU/kernel-params-only-some-arrays.ll
+++ test/GPGPU/kernel-params-only-some-arrays.ll
@@ -21,7 +21,7 @@
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
 ; KERNEL-NEXT: entry:
 ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT: %b0 = zext i32 %0 to i64
@@ -36,7 +36,7 @@
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
 ; KERNEL-NEXT: entry:
 ; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT: %b0 = zext i32 %0 to i64
@@ -47,18 +47,19 @@
 ; KERNEL-NEXT: }

-; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
+; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
 ; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
 ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]

-; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
 ; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
 ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

 define void @kernel_params_only_some_arrays(float* %A, float* %B) {
Index: test/GPGPU/mostly-sequential.ll
===================================================================
--- test/GPGPU/mostly-sequential.ll
+++ test/GPGPU/mostly-sequential.ll
@@ -2,9 +2,6 @@
 ; RUN: -disable-output < %s | \
 ; RUN: FileCheck -check-prefix=CODE %s

-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
 ; REQUIRES: pollyacc

 ; void foo(float A[]) {
 ;   for (long i = 0; i < 128; i++)
 ;     A[i] += i;
 ;
 ;   for (long i = 0; i < 128; i++)
 ;     for (long j = 0; j < 128; j++)
 ;       A[42] += i + j;
 ; }

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
 ; CODE-NEXT: dim3 k0_dimGrid(4);
@@ -28,26 +21,25 @@
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

-; CODE: for (int c0 = 0; c0 <= 127; c0 += 1)
-; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1)
-; CODE-NEXT: {
+; CODE: {
 ; CODE-NEXT: dim3 k1_dimBlock;
 ; CODE-NEXT: dim3 k1_dimGrid;
-; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, c0, c1);
+; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
 ; CODE-NEXT: }

 ; CODE: # kernel0
 ; CODE-NEXT: Stmt_bb4(32 * b0 + t0);

 ; CODE: # kernel1
-; CODE-NEXT: Stmt_bb14(c0, c1);
+; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1)
+; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1)
+; CODE-NEXT: Stmt_bb14(c0, c1);

-; Verify that we identified this kernel as non-profitable.
-; IR: br i1 false, label %polly.start, label %bb3

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

Index: test/GPGPU/non-read-only-scalars.ll
===================================================================
--- test/GPGPU/non-read-only-scalars.ll
+++ test/GPGPU/non-read-only-scalars.ll
@@ -31,12 +31,7 @@
 ;   printf("%f\n", sum);
 ; }

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
+; CODE: dim3 k0_dimBlock(32);
 ; CODE-NEXT: dim3 k0_dimGrid(1);
 ; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }
@@ -49,25 +44,18 @@
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

-; CODE: for (int c0 = 0; c0 <= 32; c0 += 1) {
-; CODE-NEXT: {
+; CODE: {
 ; CODE-NEXT: dim3 k2_dimBlock;
 ; CODE-NEXT: dim3 k2_dimGrid;
-; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
+; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

-; CODE: if (c0 <= 31)
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k3_dimBlock;
-; CODE-NEXT: dim3 k3_dimGrid;
-; CODE-NEXT: kernel3 <<<k3_dimGrid, k3_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: }
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0));
 ; CODE-NEXT: }

 ; CODE: # kernel0
@@ -80,19 +68,20 @@
 ; CODE-NEXT: Stmt_bb17();

 ; CODE: # kernel2
-; CODE-NEXT: Stmt_bb18(c0);
-
-; CODE: # kernel3
-; CODE-NEXT: Stmt_bb20(c0);
-
-; KERNEL-IR: store float %p_tmp23, float* %sum.0.phiops
-; KERNEL-IR-NEXT: [[REGA:%.+]] = addrspacecast i8 addrspace(1)* %MemRef_sum_0__phi to float*
-; KERNEL-IR-NEXT: [[REGB:%.+]] = load float, float* %sum.0.phiops
-; KERNEL-IR-NEXT: store float [[REGB]], float* [[REGA]]
-; KERNEL-IR-NEXT: [[REGC:%.+]] = addrspacecast i8 addrspace(1)* %MemRef_sum_0 to float*
-; KERNEL-IR-NEXT: [[REGD:%.+]] = load float, float* %sum.0.s2a
-; KERNEL-IR-NEXT: store float [[REGD]], float* [[REGC]]
-; KERNEL-IR-NEXT: ret void
+; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) {
+; CODE-NEXT: Stmt_bb18(c0);
+; CODE-NEXT: if (c0 <= 31)
+; CODE-NEXT: Stmt_bb20(c0);
+; CODE-NEXT: }
+
+; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_sum_0__phi)
+; KERNEL-IR: store float 0.000000e+00, float* %sum.0.phiops
+; KERNEL-IR: [[REGA:%.+]] = addrspacecast i8 addrspace(1)* %MemRef_sum_0__phi to float*
+; KERNEL-IR: [[REGB:%.+]] = load float, float* %sum.0.phiops
+; KERNEL-IR: store float [[REGB]], float* [[REGA]]
+
+; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(i8 addrspace(1)* %MemRef_A, i8 addrspace(1)* %MemRef_sum_0__phi, i8 addrspace(1)* %MemRef_sum_0)
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

Index: test/GPGPU/non-zero-array-offset.ll
===================================================================
--- test/GPGPU/non-zero-array-offset.ll
+++ test/GPGPU/non-zero-array-offset.ll
@@ -7,35 +7,30 @@
 ;
 ; REQUIRES: pollyacc

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(8);
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
+
+; CODE: dim3 k0_dimBlock(8);
 ; CODE-NEXT: dim3 k0_dimGrid(1);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

-; CODE: {
+; CODE: {
 ; CODE-NEXT: dim3 k1_dimBlock(8);
 ; CODE-NEXT: dim3 k1_dimGrid(1);
-; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_B);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }

 ; CODE: # kernel0
-; CODE-NEXT: Stmt_bb3(t0);
+; CODE-NEXT: Stmt_bb11(t0);

 ; CODE: # kernel1
-; CODE-NEXT: Stmt_bb11(t0);
+; CODE-NEXT: Stmt_bb3(t0);

 ; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
 ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)

Index: test/GPGPU/parametric-loop-bound.ll
===================================================================
--- test/GPGPU/parametric-loop-bound.ll
+++ test/GPGPU/parametric-loop-bound.ll
@@ -14,15 +14,16 @@
 ; }

 ; CODE: if (n >= 1) {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(n >= 1048546 ? 32768 : floord(n + 31, 32));
+; CODE-NEXT: dim3 k0_dimGrid(n >= 1048546 ? 32768 : (n + 31) / 32);
 ; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, n);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
 ; CODE-NEXT: }

 ; CODE: # kernel0

Index: test/GPGPU/phi-nodes-in-kernel.ll
===================================================================
--- test/GPGPU/phi-nodes-in-kernel.ll
+++ test/GPGPU/phi-nodes-in-kernel.ll
@@ -32,44 +32,28 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

-; CODE: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_out_l_055__phi, &MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(2);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_out_l_055__phi, dev_MemRef_out_l_055, dev_MemRef_c);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
+; CODE: cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32)));
+
+; CODE: {
+; CODE-NEXT: dim3 k0_dimBlock(32);
+; CODE-NEXT: dim3 k0_dimGrid(2);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_c);
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_c));

 ; CODE: # kernel0
-; CODE-NEXT: if (32 * b0 + t0 <= 48) {
-; CODE-NEXT: if (b0 == 1 && t0 == 16)
-; CODE-NEXT: Stmt_for_cond1_preheader(0);
-; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0);
-; CODE-NEXT: if (b0 == 1 && t0 == 16)
-; CODE-NEXT: Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
-; CODE-NEXT: }
-
-; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+; CODE-NEXT: if (32 * b0 + t0 <= 48)
+; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0);

-; IR: [[REGC:%.+]] = bitcast i32* %38 to i8*
+; IR: [[REGC:%.+]] = bitcast i32* %27 to i8*
 ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196)

-; KERNEL-IR: entry:
-; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32
-; KERNEL-IR-NEXT: %out_l.055.phiops = alloca i32
-; KERNEL-IR-NEXT: %1 = addrspacecast i8 addrspace(1)* %MemRef_out_l_055__phi to i32*
-; KERNEL-IR-NEXT: %2 = load i32, i32* %1
-; KERNEL-IR-NEXT: store i32 %2, i32* %out_l.055.phiops
-; KERNEL-IR-NEXT: %3 = addrspacecast i8 addrspace(1)* %MemRef_out_l_055 to i32*
-; KERNEL-IR-NEXT: %4 = load i32, i32* %3
-; KERNEL-IR-NEXT: store i32 %4, i32* %out_l.055.s2a
-
+; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c, i32) #0 {
+; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %10
+; KERNEL-IR-NEXT: store i32 %0, i32 addrspace(1)* %polly.access.MemRef_c, align 4

 define void @kernel_dynprog([50 x i32]* %c) {
 entry:

Index: test/GPGPU/region-stmt.ll
===================================================================
--- test/GPGPU/region-stmt.ll
+++ test/GPGPU/region-stmt.ll
@@ -5,11 +5,7 @@
 ; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
 ; RUN: FileCheck %s -check-prefix=IR

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
@@ -19,7 +15,6 @@
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }

 ; CODE: # kernel0
 ; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0);

Index: test/GPGPU/scheduler-timeout.ll
===================================================================
--- test/GPGPU/scheduler-timeout.ll
+++ test/GPGPU/scheduler-timeout.ll
@@ -27,11 +27,7 @@
 ;     D[i][j] += tmp[i][k] * C[k][j];
 ; }

-; CODE:Code
-; CODE-NEXT:====
-; CODE-NEXT:# host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
@@ -51,7 +47,6 @@

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }

 ; CODE: # kernel0
 ; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)

Index: test/GPGPU/size-cast.ll
===================================================================
--- test/GPGPU/size-cast.ll
+++ test/GPGPU/size-cast.ll
@@ -9,20 +9,18 @@

 ; This test case ensures that we properly sign-extend the types we are using.

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: if (arg >= 1 && arg1 == 0) {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice));
+; CODE: if (arg >= 1 && arg1 == 0) {
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048546 ? 32768 : floord(arg + 31, 32));
+; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048546 ? 32768 : (arg + 31) / 32);
 ; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_arg3));
+; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_arg2));

 ; CODE: # kernel0
 ; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1)

Index: test/GPGPU/untouched-arrays.ll
===================================================================
--- test/GPGPU/untouched-arrays.ll
+++ test/GPGPU/untouched-arrays.ll
@@ -4,11 +4,7 @@

 ; REQUIRES: pollyacc

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice));
+; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(10);
 ; CODE-NEXT: dim3 k0_dimGrid(1);
@@ -17,6 +13,7 @@
 ; CODE-NEXT: }

 ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE: cudaCheckReturn(cudaFree(dev_MemRef_global_1));
 ; CODE-NEXT: }

 ; CODE: # kernel0