Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -86,6 +86,27 @@ #define DEBUG_TYPE "polly-opt-isl" + + + +// Inspired directly from PPCGCodeGeneration.cpp + +#include "polly/ScopInfo.h" +#include "isl/union_map.h" + +extern "C" { +#include "ppcg/cuda.h" +#include "ppcg/gpu.h" +#include "ppcg/gpu_print.h" +#include "ppcg/ppcg.h" +#include "ppcg/schedule.h" +} + +using namespace polly; +using namespace llvm; + + + static cl::opt OptimizeDeps("polly-opt-optimize-only", cl::desc("Only a certain kind of dependences (all/raw)"), @@ -272,6 +293,17 @@ "transformations is applied on the schedule tree"), cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); +// (The name of the option is inspired from PPCG: see e.g. file polly/lib/External/ppcg/ppcg_options.c) +static cl::opt LiveRangeReordering( + "polly-live-range-reordering", + cl::desc("Polly - Perform live range reordering (similar to PPCG) " + "to allow successive live ranges on the same memory element to be reordered. " + "(in order to avoid serializing the successive uses of memory. " + "This can allow tiling and parallelizing.)"), + cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + + + STATISTIC(ScopsProcessed, "Number of scops processed"); STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); STATISTIC(ScopsOptimized, "Number of scops optimized"); @@ -299,6 +331,422 @@ STATISTIC(MatMulOpts, "Number of matrix multiplication patterns detected and optimized"); + + + + + + + +// Alex: making cl::opt from PPCGCodeGeneration.cpp simple vars: +bool DumpSchedule = false; +bool DumpCode = false; +bool DumpKernelIR = false; +bool DumpKernelASM = false; +bool FastMath = false; +bool SharedMemory = false; +bool PrivateMemory = false; + + +/// Used to store information PPCG wants for kills. This information is +/// used by live range reordering. +/// +/// @see computeLiveRangeReordering +/// @see GPUNodeBuilder::createPPCGScop +/// @see GPUNodeBuilder::createPPCGProg +struct MustKillsInfo { + /// Collection of all kill statements that will be sequenced at the end of + /// PPCGScop->schedule. + /// + /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` + /// which merges schedules in *arbitrary* order. + /// (we don't care about the order of the kills anyway). + isl::schedule KillsSchedule; + /// Map from kill statement instances to scalars that need to be + /// killed. + /// + /// We currently derive kill information for: + /// 1. phi nodes. PHI nodes are not alive outside the scop and can + /// consequently all be killed. + /// 2. Scalar arrays that are not used outside the Scop. This is + /// checked by `isScalarUsesContainedInScop`. + /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } + isl::union_map TaggedMustKills; + + /// Tagged must kills stripped of the tags. + /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] } + isl::union_map MustKills; + + MustKillsInfo() : KillsSchedule(nullptr) {} +}; + + + + + +/// Check if SAI's uses are entirely contained within Scop S. +/// If a scalar is used only with a Scop, we are free to kill it, as no data +/// can flow in/out of the value any more. +/// @see computeMustKillsInfo +static bool isScalarUsesContainedInScop(const Scop &S, + const ScopArrayInfo *SAI) { + assert(SAI->isValueKind() && "this function only deals with scalars." + " Dealing with arrays required alias analysis"); + + const Region &R = S.getRegion(); + for (User *U : SAI->getBasePtr()->users()) { + Instruction *I = dyn_cast(U); + assert(I && "invalid user of scop array info"); + if (!R.contains(I)) + return false; + } + return true; +} + +/// Compute must-kills needed to enable live range reordering with PPCG. +/// +/// @params S The Scop to compute live range reordering information +/// @returns live range reordering information that can be used to setup +/// PPCG. +static MustKillsInfo computeMustKillsInfo(const Scop &S) { + const isl::space ParamSpace = S.getParamSpace(); + MustKillsInfo Info; + + // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria: + // 1.1 phi nodes in scop. + // 1.2 scalars that are only used within the scop + SmallVector KillMemIds; + for (ScopArrayInfo *SAI : S.arrays()) { + if (SAI->isPHIKind() || + (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI))) + KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release())); + } + + Info.TaggedMustKills = isl::union_map::empty(ParamSpace); + Info.MustKills = isl::union_map::empty(ParamSpace); + + // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the + // schedule: + // - filter: "[control] -> { }" + // So, we choose to not create this to keep the output a little nicer, + // at the cost of some code complexity. + Info.KillsSchedule = nullptr; + + for (isl::id &ToKillId : KillMemIds) { + isl::id KillStmtId = isl::id::alloc( + S.getIslCtx(), + std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr); + + // NOTE: construction of tagged_must_kill: + // 2. We need to construct a map: + // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } + // To construct this, we use `isl_map_domain_product` on 2 maps`: + // 2a. StmtToScalar: + // [param] -> { Stmt_phantom[] -> scalar_to_kill[] } + // 2b. PhantomRefToScalar: + // [param] -> { ref_phantom[] -> scalar_to_kill[] } + // + // Combining these with `isl_map_domain_product` gives us + // TaggedMustKill: + // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } + + // 2a. [param] -> { Stmt[] -> scalar_to_kill[] } + isl::map StmtToScalar = isl::map::universe(ParamSpace); + StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); + StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId)); + + isl::id PhantomRefId = isl::id::alloc( + S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(), + nullptr); + + // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] } + isl::map PhantomRefToScalar = isl::map::universe(ParamSpace); + PhantomRefToScalar = + PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId); + PhantomRefToScalar = + PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId); + + // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } + isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar); + Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); + + // 2. [param] -> { Stmt[] -> scalar_to_kill[] } + Info.MustKills = Info.TaggedMustKills.domain_factor_domain(); + + // 3. Create the kill schedule of the form: + // "[param] -> { Stmt_phantom[] }" + // Then add this to Info.KillsSchedule. + isl::space KillStmtSpace = ParamSpace; + KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); + isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); + + isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); + if (Info.KillsSchedule) + Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule); + else + Info.KillsSchedule = KillSchedule; + } + + return Info; +} + + + +/* IMPORTANT: the methods below are defined in PPCGCodeGeneration.cpp: + */ +// PPCGWrapperLRR is very similar to class PPCGCodeGeneration +namespace { +class PPCGWrapperLRR : public ScopPass { +public: + static char ID; + + /// The scop that is currently processed. + Scop *S; + + + + + PPCGWrapperLRR() : ScopPass(ID) {} + + /// Construct compilation options for PPCG. + /// + /// @returns The compilation options. + ppcg_options *createPPCGOptions() { + auto DebugOptions = + (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); + auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); + + DebugOptions->dump_schedule_constraints = false; + DebugOptions->dump_schedule = false; + DebugOptions->dump_final_schedule = false; + DebugOptions->dump_sizes = false; + DebugOptions->verbose = false; + + Options->debug = DebugOptions; + + Options->group_chains = false; + Options->reschedule = true; + Options->scale_tile_loops = false; + Options->wrap = false; + + Options->non_negative_parameters = false; + Options->ctx = nullptr; + Options->sizes = nullptr; + + Options->tile = true; + Options->tile_size = 32; + + Options->isolate_full_tiles = false; + + Options->use_private_memory = PrivateMemory; + Options->use_shared_memory = SharedMemory; + Options->max_shared_memory = 48 * 1024; + + Options->target = PPCG_TARGET_CUDA; + Options->openmp = false; + Options->linearize_device_arrays = true; + Options->allow_gnu_extensions = false; + + Options->unroll_copy_shared = false; + Options->unroll_gpu_tile = false; + Options->live_range_reordering = true; + + Options->live_range_reordering = true; + Options->hybrid = false; + Options->opencl_compiler_options = nullptr; + Options->opencl_use_gpu = false; + Options->opencl_n_include_file = 0; + Options->opencl_include_files = nullptr; + Options->opencl_print_kernel_types = false; + Options->opencl_embed_kernel_code = false; + + Options->save_schedule_file = nullptr; + Options->load_schedule_file = nullptr; + + return Options; + } + + + /// Get a tagged access relation containing all accesses of type @p AccessTy. + /// + /// Instead of a normal access of the form: + /// + /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] + /// + /// a tagged access has the form + /// + /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] + /// + /// where 'id' is an additional space that references the memory access that + /// triggered the access. + /// + /// @param AccessTy The type of the memory accesses to collect. + /// + /// @return The relation describing all tagged memory accesses. + isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { + isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release()); + + for (auto &Stmt : *S) + for (auto &Acc : Stmt) + if (Acc->getType() == AccessTy) { + isl_map *Relation = Acc->getAccessRelation().release(); + Relation = + isl_map_intersect_domain(Relation, Stmt.getDomain().release()); + + isl_space *Space = isl_map_get_space(Relation); + Space = isl_space_range(Space); + Space = isl_space_from_range(Space); + Space = + isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); + isl_map *Universe = isl_map_universe(Space); + Relation = isl_map_domain_product(Relation, Universe); + Accesses = isl_union_map_add_map(Accesses, Relation); + } + + return Accesses; + } + + /// Get the set of all read accesses, tagged with the access id. + /// + /// @see getTaggedAccesses + isl_union_map *getTaggedReads() { + return getTaggedAccesses(MemoryAccess::READ); + } + + /// Get the set of all may (and must) accesses, tagged with the access id. + /// + /// @see getTaggedAccesses + isl_union_map *getTaggedMayWrites() { + return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), + getTaggedAccesses(MemoryAccess::MUST_WRITE)); + } + + /// Get the set of all must accesses, tagged with the access id. + /// + /// @see getTaggedAccesses + isl_union_map *getTaggedMustWrites() { + return getTaggedAccesses(MemoryAccess::MUST_WRITE); + } + + + /// Collect parameter and array names as isl_ids. + /// + /// To reason about the different parameters and arrays used, ppcg requires + /// a list of all isl_ids in use. As PPCG traditionally performs + /// source-to-source compilation each of these isl_ids is mapped to the + /// expression that represents it. As we do not have a corresponding + /// expression in Polly, we just map each id to a 'zero' expression to match + /// the data format that ppcg expects. + /// + /// @returns Retun a map from collected ids to 'zero' ast expressions. + __isl_give isl_id_to_ast_expr *getNames() { + auto *Names = isl_id_to_ast_expr_alloc( + S->getIslCtx().get(), + S->getNumParams() + std::distance(S->array_begin(), S->array_end())); + auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx().get())); + + for (const SCEV *P : S->parameters()) { + isl_id *Id = S->getIdForParam(P).release(); + Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); + } + + for (auto &Array : S->arrays()) { + auto Id = Array->getBasePtrId().release(); + Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); + } + + isl_ast_expr_free(Zero); + + return Names; + } + + ppcg_scop *createPPCGScop() { + DEBUG(dbgs() << "Entered createPPCGScop()\n"); + + MustKillsInfo KillsInfo = computeMustKillsInfo(*S); + + auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); + + PPCGScop->options = createPPCGOptions(); + // enable live range reordering + PPCGScop->options->live_range_reordering = 1; + + PPCGScop->start = 0; + PPCGScop->end = 0; + + PPCGScop->context = S->getContext().release(); + PPCGScop->domain = S->getDomains().release(); + // TODO TODO: investigate this further. PPCG calls collect_call_domains. + PPCGScop->call = isl_union_set_from_set(S->getContext().release()); + PPCGScop->tagged_reads = getTaggedReads(); + PPCGScop->reads = S->getReads().release(); + PPCGScop->live_in = nullptr; + PPCGScop->tagged_may_writes = getTaggedMayWrites(); + PPCGScop->may_writes = S->getWrites().release(); + PPCGScop->tagged_must_writes = getTaggedMustWrites(); + PPCGScop->must_writes = S->getMustWrites().release(); + PPCGScop->live_out = nullptr; + PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take(); + PPCGScop->must_kills = KillsInfo.MustKills.take(); + + PPCGScop->tagger = nullptr; + PPCGScop->independence = + isl_union_map_empty(isl_set_get_space(PPCGScop->context)); + PPCGScop->dep_flow = nullptr; + PPCGScop->tagged_dep_flow = nullptr; + PPCGScop->dep_false = nullptr; + PPCGScop->dep_forced = nullptr; + PPCGScop->dep_order = nullptr; + PPCGScop->tagged_dep_order = nullptr; + + PPCGScop->schedule = S->getScheduleTree().release(); + // If we have something non-trivial to kill, add it to the schedule + if (KillsInfo.KillsSchedule.get()) + PPCGScop->schedule = isl_schedule_sequence( + PPCGScop->schedule, KillsInfo.KillsSchedule.take()); + + PPCGScop->names = getNames(); + PPCGScop->pet = nullptr; + + compute_tagger(PPCGScop); + compute_dependences(PPCGScop); + eliminate_dead_code(PPCGScop); + + DEBUG(dbgs() << "Exiting createPPCGScop()\n"); + + return PPCGScop; + } // END createPPCGScop() + + + /// Free the options in the ppcg scop structure. + /// + /// ppcg is not freeing these options for us. To avoid leaks we do this + /// ourselves. + /// + /// @param PPCGScop The scop referencing the options to free. + void freeOptions(ppcg_scop *PPCGScop) { + free(PPCGScop->options->debug); + PPCGScop->options->debug = nullptr; + free(PPCGScop->options); + PPCGScop->options = nullptr; + } + + + // Alex: this is just a bogus definition of runScop() + bool runOnScop(Scop &CurrentScop) override { + } + +}; // END: class PPCGWrapperLRR +} // END: namespace + +char PPCGWrapperLRR::ID = 1; +// Alex: END NEW code + + + + + + /// Create an isl::union_set, which describes the isolate option based on /// IsolateDomain. /// @@ -1470,6 +1918,49 @@ &Version); } + + +PPCGWrapperLRR ppcgObj; +ppcg_scop *ps = NULL; +void FreePPCGObjects() { + if (LiveRangeReordering == true) { + /* Implementing the deallocation functionality at the end of + PPCGCodeGeneration::runOnScop() */ + + ppcgObj.freeOptions(ps); + + // ppcg_scop_free(ps); + // Implementing the functionality of ppcg_scop_free(ps); + isl_set_free(ps->context); + isl_union_set_free(ps->domain); + isl_union_set_free(ps->call); + isl_union_map_free(ps->tagged_reads); + isl_union_map_free(ps->reads); + isl_union_map_free(ps->live_in); + isl_union_map_free(ps->tagged_may_writes); + isl_union_map_free(ps->tagged_must_writes); + isl_union_map_free(ps->may_writes); + isl_union_map_free(ps->must_writes); + isl_union_map_free(ps->live_out); + isl_union_map_free(ps->tagged_must_kills); + isl_union_map_free(ps->must_kills); +// TODO TODO: free these objects also +// Avoiding various deallocation errors because the ISL objects were not allocated + //isl_union_map_free(ps->tagged_dep_flow); + //isl_union_map_free(ps->dep_flow); + isl_union_map_free(ps->dep_false); + // isl_union_map_free(ps->dep_forced); + //isl_union_map_free(ps->tagged_dep_order); + //isl_union_map_free(ps->dep_order); + isl_schedule_free(ps->schedule); + isl_union_pw_multi_aff_free(ps->tagger); + isl_union_map_free(ps->independence); + isl_id_to_ast_expr_free(ps->names); + + DEBUG(dbgs() << "FreePPCGObjects(): (LiveRangeReordering == true) exiting... \n"); + } +} + bool IslScheduleOptimizer::runOnScop(Scop &S) { // Skip SCoPs in case they're already optimised by PPCGCodeGeneration if (S.isToBeSkipped()) @@ -1541,11 +2032,12 @@ "or 'no'. Falling back to default: 'yes'\n"; } - DEBUG(dbgs() << "\n\nCompute schedule from: "); - DEBUG(dbgs() << "Domain := " << Domain << ";\n"); - DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); - DEBUG(dbgs() << "Validity := " << Validity << ";\n"); - + if (LiveRangeReordering == false) { + DEBUG(dbgs() << "\n\nCompute schedule from: "); + DEBUG(dbgs() << "Domain := " << Domain << ";\n"); + DEBUG(dbgs() << "Proximity := " << Proximity << ";\n"); + DEBUG(dbgs() << "Validity := " << Validity << ";\n"); + } unsigned IslSerializeSCCs; if (FusionStrategy == "max") { @@ -1589,15 +2081,76 @@ isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient); + // NEW code + // From PPCGCodeGeneration.cpp: Set scheduling strategy to same strategy PPCG is using. + //isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); + //isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); + isl_options_set_schedule_whole_component(Ctx, false); + // END NEW code isl_options_set_tile_scale_tile_loops(Ctx, 0); auto OnErrorStatus = isl_options_get_on_error(Ctx); isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE); auto SC = isl::schedule_constraints::on_domain(Domain); - SC = SC.set_proximity(Proximity); - SC = SC.set_validity(Validity); - SC = SC.set_coincidence(Validity); + + + + // NEW code starts here + if (LiveRangeReordering == false) { + DEBUG(dbgs() << "LiveRangeReordering == false\n"); + + SC = SC.set_proximity(Proximity); + SC = SC.set_validity(Validity); + SC = SC.set_coincidence(Validity); + } + else { + DEBUG(dbgs() << "LiveRangeReordering == true\n"); + + ppcgObj.S = &S; + + // This code is inspired from PPCGCodeGeneration::runOnScop() + ps = ppcgObj.createPPCGScop(); + + // IMPORTANT NOTE: from isl-noexceptions.h: inline isl::union_map give(__isl_take isl_union_map *ptr); + isl::union_map psTaggedDepFlow = give(ps->tagged_dep_flow); + isl::union_map psTaggedDepOrder = give(ps->tagged_dep_order); + // + SC = SC.set_conditional_validity( + psTaggedDepFlow, + psTaggedDepOrder + ); + + isl::union_map psDepFlow = give(ps->dep_flow); + Validity = give(psDepFlow.copy()); + isl::union_map psDepForced = give(ps->dep_forced); + Validity = Validity.unite(psDepForced); + + // TODO TODO: if (ps->options->openmp) + isl::union_map Coincidence = give(Validity.copy()); + isl::union_map psDepOrder = give(ps->dep_order); + Coincidence = Coincidence.unite(psDepOrder); + SC = SC.set_coincidence(Coincidence); + + SC = SC.set_proximity(psDepFlow); //give(ps->dep_flow)); + /* From https://lirias.kuleuven.be/bitstream/123456789/519489/1/live_range_reordering.pdf: + "The proximity constraints are set to the union of the flow and the + false dependences." + TODO TODO: therefore, we need to SC.set_proximity(union(psDepFlow, dep_false)); */ + + SC = SC.set_validity(Validity); + + + DEBUG(dbgs() << "\n\nCompute schedule from: "); + DEBUG(dbgs() << "Domain := " << Domain << ";\n"); + DEBUG(dbgs() << "Proximity := " << psDepFlow << ";\n"); + DEBUG(dbgs() << "Validity := " << Validity << ";\n"); + DEBUG(dbgs() << "Coincidence := " << Coincidence << ";\n"); + DEBUG(dbgs() << "psTaggedDepFlow := " << psTaggedDepFlow << ";\n"); + DEBUG(dbgs() << "psTaggedDepOrder := " << psTaggedDepOrder << ";\n"); + } + // END new code + auto Schedule = SC.compute_schedule(); isl_options_set_on_error(Ctx, OnErrorStatus); @@ -1605,8 +2158,10 @@ // In cases the scheduler is not able to optimize the code, we just do not // touch the schedule. - if (!Schedule) + if (!Schedule) { + FreePPCGObjects(); return false; + } ScopsRescheduled++; @@ -1626,8 +2181,10 @@ auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); walkScheduleTreeForStatistics(NewSchedule, 2); - if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) + if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) { + FreePPCGObjects(); return false; + } auto ScopStats = S.getStatistics(); ScopsOptimized++; @@ -1640,6 +2197,7 @@ if (OptimizedScops) errs() << S; + FreePPCGObjects(); return false; } @@ -1681,3 +2239,4 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl", "Polly - Optimize schedule of SCoP", false, false) +