Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -49,6 +49,8 @@ #include "llvm/Support/Debug.h" +#include + using namespace polly; using namespace llvm; @@ -112,6 +114,110 @@ cl::desc("Minimal number of compute statements to run on GPU."), cl::Hidden, cl::init(10 * 512 * 512)); +/// Used to store information PPCG wants for live range reordering. +/// +/// @see computeLiveRangeReordering +/// @see GPUNodeBuilder::createPPCGScop +/// @see GPUNodeBuilder::createPPCGProg +struct MustKillsInfo { + /// Collection of all kill statements that will be sequenced at the end of + /// PPCGScop->schedule. + /// + /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` + /// which merges schedules in *arbitrary* order. + /// (we don't care about the order of the kills anyway). + isl::schedule KillsSchedule; + /// Map from kill statement instances to scalars that need to be + /// killed. + /// + /// We currently only derive kill information for phi nodes, as phi nodes + /// allow us to easily derive kill information. PHI nodes are not alive + /// outside the scop and can consequently all be "killed". [params] -> { + /// [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] } + isl::union_map TaggedMustKills; + + MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){}; +}; + +/// Compute must-kills needed to enable live range reordering with PPCG. +/// +/// @params S The Scop to compute live range reordering information +/// @returns live range reordering information that can be used to setup +/// PPCG. +static MustKillsInfo computeMustKillsInfo(const Scop &S) { + const isl::space ParamSpace(isl::manage(S.getParamSpace())); + MustKillsInfo Info; + + // 1. Collect phi nodes in scop. + SmallVector KillMemIds; + for (ScopArrayInfo *SAI : S.arrays()) { + if (!SAI->isPHIKind()) + continue; + + KillMemIds.push_back(isl::manage(SAI->getBasePtrId())); + } + + Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace)); + + // Initialising KillsSchedule to isl_set_empty + // creates an empty node in the schedule: + // - filter: "[control] -> { }" + // So, we choose to not create this to keep the output a little nicer, + // at the cost of some code complexity. + Info.KillsSchedule = nullptr; + + for (isl::id &phiId : KillMemIds) { + isl::id KillStmtId = isl::id::alloc( + S.getIslCtx(), std::string("SKill_phantom_").append(phiId.get_name()), + nullptr); + + // NOTE: construction of tagged_must_kill: + // 2. We need to construct a map: + // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref } + // To construct this, we use `isl_map_domain_product` on 2 maps`: + // 2a. StmtToPhi: + // [param] -> { Stmt_phantom[] -> phi_ref[] } + // 2b. PhantomRefToPhi: + // [param] -> { ref_phantom[] -> phi_ref[] } + // + // Combining these with `isl_map_domain_product` gives us + // TaggedMustKill: + // [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] } + + // 2a. [param] -> { S_2[] -> phi_ref[] } + isl::map StmtToPhi = isl::map::universe(isl::space(ParamSpace)); + StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); + StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::out, isl::id(phiId)); + + isl::id PhantomRefId = isl::id::alloc( + S.getIslCtx(), std::string("ref_phantom") + phiId.get_name(), nullptr); + + // 2b. [param] -> { phantom_ref[] -> memref[] } + isl::map PhantomRefToPhi = isl::map::universe(isl::space(ParamSpace)); + PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::in, PhantomRefId); + PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::out, phiId); + + // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] } + isl::map TaggedMustKill = StmtToPhi.domain_product(PhantomRefToPhi); + Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); + + // 3. Create the kill schedule of the form: + // "[param] -> { Stmt_phantom[] }" + // Then add this to Info.KillsSchedule. + isl::space KillStmtSpace = ParamSpace; + KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); + isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); + + isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); + if (Info.KillsSchedule) + Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule); + else + Info.KillsSchedule = KillSchedule; + } + + return Info; +} + /// Create the ast expressions for a ScopStmt. /// /// This function is a callback for to generate the ast expressions for each @@ -2114,6 +2220,8 @@ auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); PPCGScop->options = createPPCGOptions(); + // enable live range reordering + PPCGScop->options->live_range_reordering = 1; PPCGScop->start = 0; PPCGScop->end = 0; @@ -2129,10 +2237,9 @@ PPCGScop->tagged_must_writes = getTaggedMustWrites(); PPCGScop->must_writes = S->getMustWrites(); PPCGScop->live_out = nullptr; - PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); PPCGScop->tagger = nullptr; - - PPCGScop->independence = nullptr; + PPCGScop->independence = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGScop->dep_flow = nullptr; PPCGScop->tagged_dep_flow = nullptr; PPCGScop->dep_false = nullptr; @@ -2141,8 +2248,15 @@ PPCGScop->tagged_dep_order = nullptr; PPCGScop->schedule = S->getScheduleTree(); - PPCGScop->names = getNames(); + MustKillsInfo KillsInfo = computeMustKillsInfo(*S); + // If we have something non-trivial to kill, add it to the schedule + if (KillsInfo.KillsSchedule.get()) + PPCGScop->schedule = isl_schedule_sequence( + PPCGScop->schedule, KillsInfo.KillsSchedule.take()); + PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take(); + + PPCGScop->names = getNames(); PPCGScop->pet = nullptr; compute_tagger(PPCGScop); @@ -2414,7 +2528,13 @@ PPCGProg->to_inner = getArrayIdentity(); PPCGProg->to_outer = getArrayIdentity(); PPCGProg->any_to_outer = nullptr; - PPCGProg->array_order = nullptr; + + // this needs to be set when live range reordering is enabled. + // NOTE: I believe that is conservatively correct. I'm not sure + // what the semantics of this is. + // Quoting PPCG/gpu.h: "Order dependences on non-scalars." + PPCGProg->array_order = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); PPCGProg->n_stmts = std::distance(S->begin(), S->end()); PPCGProg->stmts = getStatements(); PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); @@ -2424,7 +2544,6 @@ createArrays(PPCGProg); PPCGProg->may_persist = compute_may_persist(PPCGProg); - return PPCGProg; } Index: test/GPGPU/non-read-only-scalars.ll =================================================================== --- test/GPGPU/non-read-only-scalars.ll +++ test/GPGPU/non-read-only-scalars.ll @@ -67,7 +67,6 @@ ; CODE: } ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost)); ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); ; CODE-NEXT: } Index: test/GPGPU/phi-nodes-in-kernel.ll =================================================================== --- test/GPGPU/phi-nodes-in-kernel.ll +++ test/GPGPU/phi-nodes-in-kernel.ll @@ -24,9 +24,8 @@ ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } -; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE: cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); ; CODE-NEXT: } ; CODE: # kernel0 @@ -41,9 +40,7 @@ ; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4) -; IR: [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4) -; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8* +; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8* ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4) ; KERNEL-IR: entry: Index: test/GPGPU/privatization-simple.ll =================================================================== --- /dev/null +++ test/GPGPU/privatization-simple.ll @@ -0,0 +1,56 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; void f(int A[], int B[], int control, int C[]) { +; int x; +; #pragma scop +; for(int i = 0; i < 1000; i ++) { +; x = 0; +; if(control) x = C[i]; +; B[i] = x * A[i]; +; +; } +; #pragma endscop +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32* %B, i32 %control, i32* %C) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %tmp4 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp8 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp8, %x.0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +} Index: test/GPGPU/privatization.ll =================================================================== --- /dev/null +++ test/GPGPU/privatization.ll @@ -0,0 +1,60 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR + +; SCOP: Function: checkPrivatization +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 + + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + +; +; +; void checkPrivatization(int A[], int B[], int C[], int control) { +; int x; +; #pragma scop +; for (int i = 0; i < 1000; i++) { +; x = 0; +; if (control) +; x += C[i]; +; +; B[i] = x * A[i]; +; } +; #pragma endscop +; } +; +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @checkPrivatization(i32* %A, i32* %B, i32* %C, i32 %control) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %if.end + %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] + %tobool = icmp eq i32 %control, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %tmp4 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %for.body, %if.then + %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp9 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp9, %x.0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %if.end + ret void +}