Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -49,6 +49,8 @@ #include "llvm/Support/Debug.h" +#include + using namespace polly; using namespace llvm; @@ -112,6 +114,122 @@ cl::desc("Minimal number of compute statements to run on GPU."), cl::Hidden, cl::init(10 * 512 * 512)); +/// Used to store information PPCG wants for live range reordering. +/// +/// @see computeLiveRangeReordering +/// @see GPUNodeBuilder::createPPCGScop +/// @see GPUNodeBuilder::createPPCGProg +struct LiveRangeReorderingInfo { + /// Collection of all kill statements that will be grafted to the end of + /// PPCGScop->schedule. + /// + /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` + /// which merges schedules in *arbitrary* order. + /// (we don't care about the order of the kills anyway). + isl_schedule *KillsSchedule; + /// Map from kill statement instances to scalars that need to be + /// killed. + /// + /// We choose to only only kill phi nodes in a scop. + /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] } + isl_union_map *TaggedMustKills; + + LiveRangeReorderingInfo() + : KillsSchedule(nullptr), TaggedMustKills(nullptr){}; +}; + +/// Compute information needed to enable live range reordering information +/// with PPCG. +/// +/// @params S The Scop to compute live range reordering information +/// @returns live range reordering information that can be used to setup +/// PPCG. +static LiveRangeReorderingInfo computeLiveRangeReorderingInfo(const Scop &S) { + LiveRangeReorderingInfo Info; + + // 1. Collect phi nodes in scop. + SmallVector KillMemIds; + for (ScopArrayInfo *SAI : S.arrays()) { + if (!SAI->isPHIKind()) + continue; + + KillMemIds.push_back(SAI->getBasePtrId()); + } + + Info.TaggedMustKills = + isl_union_map_from_map(isl_map_empty(S.getParamSpace())); + + // Initialising KillsSchedule to isl_set_empty + // creates an empty node in the schedule node: + // - filter: "[control] -> { }" + // So, we choose to not create this to keep the output a little nicer, + // at the cost of some code complexity. + Info.KillsSchedule = nullptr; + + for (isl_id *phiId : KillMemIds) { + + std::stringstream KillStmtSS; + KillStmtSS << "Skill_phantom_" << isl_id_get_name(phiId); + isl_id *KillStmtId = + isl_id_alloc(S.getIslCtx(), KillStmtSS.str().c_str(), NULL); + + // NOTE: construction of tagged_must_kill + // 2. We need to construct a map + // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref } + // To construct this, we use `isl_map_domain_product` on 2 maps` + // 2a. StmtToPhi: + // [param] -> { Stmt_phantom[] -> phi_ref[] } + // 2b. PhantomRefToPhi: + // [param] -> { ref_phantom[] -> phi_ref[] } + // + // Combining these with `isl_map_domain_product` gives us + // TaggedMustKill: + // [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] } + + // 2a. [param] -> { S_2[] -> phi_ref[] } + isl_map *StmtToPhi = isl_map_universe(S.getParamSpace()); + StmtToPhi = + + isl_map_set_tuple_id(StmtToPhi, isl_dim_in, isl_id_copy(KillStmtId)); + StmtToPhi = + isl_map_set_tuple_id(StmtToPhi, isl_dim_out, isl_id_copy(phiId)); + + std::stringstream PhantomRefSS; + PhantomRefSS << "ref_phantom_" << isl_id_get_name(phiId); + isl_id *PhantomRefId = + isl_id_alloc(S.getIslCtx(), PhantomRefSS.str().c_str(), NULL); + + // 2b. [param] -> { phantom_ref[] -> memref[] } + isl_map *PhantomRefToPhi = isl_map_universe(S.getParamSpace()); + PhantomRefToPhi = + isl_map_set_tuple_id(PhantomRefToPhi, isl_dim_in, PhantomRefId); + PhantomRefToPhi = isl_map_set_tuple_id(PhantomRefToPhi, isl_dim_out, phiId); + + // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] } + isl_map *TaggedMustKill = + isl_map_domain_product(StmtToPhi, PhantomRefToPhi); + Info.TaggedMustKills = isl_union_map_union( + Info.TaggedMustKills, isl_union_map_from_map(TaggedMustKill)); + + // 3. Create the kill schedule of the form + // "[param] -> { Stmt_phantom[] }" + // and add to Info.KillsSchedule + isl_space *KillStmtSpace = S.getParamSpace(); + KillStmtSpace = + isl_space_set_tuple_id(KillStmtSpace, isl_dim_set, KillStmtId); + isl_union_set *KillStmtDomain = + isl_union_set_from_set(isl_set_universe(KillStmtSpace)); + + isl_schedule *KillSchedule = isl_schedule_from_domain(KillStmtDomain); + if (Info.KillsSchedule) + Info.KillsSchedule = isl_schedule_set(Info.KillsSchedule, KillSchedule); + else + Info.KillsSchedule = KillSchedule; + } + + return Info; +} + /// Create the ast expressions for a ScopStmt. /// /// This function is a callback for to generate the ast expressions for each @@ -2114,6 +2232,8 @@ auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); PPCGScop->options = createPPCGOptions(); + // enable live range reordering + PPCGScop->options->live_range_reordering = 1; PPCGScop->start = 0; PPCGScop->end = 0; @@ -2129,10 +2249,9 @@ PPCGScop->tagged_must_writes = getTaggedMustWrites(); PPCGScop->must_writes = S->getMustWrites(); PPCGScop->live_out = nullptr; - PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); PPCGScop->tagger = nullptr; - - PPCGScop->independence = nullptr; + PPCGScop->independence = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGScop->dep_flow = nullptr; PPCGScop->tagged_dep_flow = nullptr; PPCGScop->dep_false = nullptr; @@ -2141,8 +2260,15 @@ PPCGScop->tagged_dep_order = nullptr; PPCGScop->schedule = S->getScheduleTree(); - PPCGScop->names = getNames(); + LiveRangeReorderingInfo LiveRangeInfo = computeLiveRangeReorderingInfo(*S); + // If we have something non-trivial to kill, add it to the schedule + if (LiveRangeInfo.KillsSchedule) + PPCGScop->schedule = isl_schedule_sequence(PPCGScop->schedule, + LiveRangeInfo.KillsSchedule); + PPCGScop->tagged_must_kills = LiveRangeInfo.TaggedMustKills; + + PPCGScop->names = getNames(); PPCGScop->pet = nullptr; compute_tagger(PPCGScop); @@ -2414,7 +2540,13 @@ PPCGProg->to_inner = getArrayIdentity(); PPCGProg->to_outer = getArrayIdentity(); PPCGProg->any_to_outer = nullptr; - PPCGProg->array_order = nullptr; + + // this needs to be set when live range reordering is enabled. + // NOTE: I believe that is conservatively correct. I'm not sure + // what the semantics of this is. + // Quoting PPCG/gpu.h: "Order dependences on non-scalars." + PPCGProg->array_order = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGProg->n_stmts = std::distance(S->begin(), S->end()); PPCGProg->stmts = getStatements(); PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); @@ -2424,7 +2556,6 @@ createArrays(PPCGProg); PPCGProg->may_persist = compute_may_persist(PPCGProg); - return PPCGProg; } Index: test/GPGPU/non-read-only-scalars.ll =================================================================== --- test/GPGPU/non-read-only-scalars.ll +++ test/GPGPU/non-read-only-scalars.ll @@ -67,7 +67,6 @@ ; CODE: } ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost)); ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); ; CODE-NEXT: } Index: test/GPGPU/phi-nodes-in-kernel.ll =================================================================== --- test/GPGPU/phi-nodes-in-kernel.ll +++ test/GPGPU/phi-nodes-in-kernel.ll @@ -24,9 +24,8 @@ ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } -; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); ; CODE-NEXT: } ; CODE: # kernel0 @@ -41,9 +40,7 @@ ; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4) -; IR: [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4) -; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8* +; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8* ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4) ; KERNEL-IR: entry: Index: test/GPGPU/privatization-simple.ll =================================================================== --- /dev/null +++ test/GPGPU/privatization-simple.ll @@ -0,0 +1,56 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; void f(int A[], int B[], int control, int C[]) { +; int x; +; #pragma scop +; for(int i = 0; i < 1000; i ++) { +; x = 0; +; if(control) x = C[i]; +; B[i] = x * A[i]; +; +; } +; #pragma endscop +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32* %B, i32 %control, i32* %C) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %tmp4 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp8 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp8, %x.0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +} Index: test/GPGPU/privatization.ll =================================================================== --- /dev/null +++ test/GPGPU/privatization.ll @@ -0,0 +1,60 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; SCOP: Function: checkPrivatization +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; +; +; void checkPrivatization(int A[], int B[], int C[], int control) { +; int x; +; #pragma scop +; for (int i = 0; i < 1000; i++) { +; x = 0; +; if (control) +; x += C[i]; +; +; B[i] = x * A[i]; +; } +; #pragma endscop +; } +; +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @checkPrivatization(i32* %A, i32* %B, i32* %C, i32 %control) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %tmp4 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp9 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp9, %x.0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +}