diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
--- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -142,6 +142,10 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
   void printPipeline(raw_ostream &OS,
                      function_ref<StringRef(StringRef)> MapClassName2PassName);
+
+  /// Flushes the MLGO loop-unroll training log, if one was collected. Defined
+  /// out of line in LoopUnrollPass.cpp.
+  ~LoopUnrollPass();
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -28,11 +28,17 @@
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopPropertiesAnalysis.h"
 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/NoInferenceModelRunner.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TensorSpec.h"
+#include "llvm/Analysis/Utils/TFUtils.h"
+#include "llvm/Analysis/Utils/TrainingLogger.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
@@ -51,6 +57,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -174,6 +181,190 @@
     cl::desc("Default threshold (max size of unrolled "
              "loop), used in all but O3 optimizations"));
 
+static cl::opt<bool>
+    EnableMLGOUnroll("mlgo-unroll", cl::init(false), cl::Hidden,
+                     cl::desc("Allows MLGO to operate on LoopUnroll"));
+
+static cl::opt<std::string>
+    TrainingLog("mlgo-unroll-training-log", cl::Hidden,
+                cl::desc("Training log for loop partial unroll"));
+
+namespace mlgo_loop_unroll {
+
+/// These features are extracted by LoopPropertiesAnalysis
+/// Tuple of (data_type, variable_name, shape, description)
+#define LOOP_UNROLL_FEATURES_LIST(M)                                           \
+  M(int64_t, loop_size, {1}, "size of loop")                                   \
+  M(int64_t, trip_count, {1}, "static trip count of loop")                     \
+  M(int64_t, is_innermost_loop, {1}, "whether the loop is the innermost loop") \
+  M(int64_t, preheader_blocksize, {1}, "preheader blocksize (by instruction)") \
+  M(int64_t, bb_count, {1}, "number of basic blocks (ignoring subloops)")      \
+  M(int64_t, num_of_loop_latch, {1}, "number of loop latches")                 \
+  M(int64_t, load_inst_count, {1}, "load instruction count")                   \
+  M(int64_t, store_inst_count, {1}, "store instruction count")                 \
+  M(int64_t, logical_inst_count, {1}, "logical instruction count")             \
+  M(int64_t, cast_inst_count, {1}, "cast instruction count")
+
+// The model learns to decide whether or not to partial unroll a loop.
+// If unroll_count == 0, a loop is not unrolled, otherwise it is unrolled by the
+// factor of provided decision.
+#define DecisionName "unroll_count"
+
+enum FeatureIDs {
+#define _FEATURE_IDX(_, name, __, ___) name,
+  LOOP_UNROLL_FEATURES_LIST(_FEATURE_IDX)
+#undef _FEATURE_IDX
+  FeatureCount
+};
+
+#define DECL_FEATURES(type, name, shape, _)                                    \
+  TensorSpec::createSpec<type>(#name, shape),
+
+/// Input features for training. Note that in the future we can attempt to have
+/// multiple sets of features for different purpose. For example a set of
+/// input features in release mode.
+static const std::vector<TensorSpec> InputFeatures{
+    LOOP_UNROLL_FEATURES_LIST(DECL_FEATURES)};
+
+/// A single output feature, the decision whether or not to loop partial unroll.
+/// NOTE(review): element type assumed to be int32_t, since the decision is
+/// written with Logger::logInt32Value -- confirm.
+static const TensorSpec OutputFeature =
+    TensorSpec::createSpec<int32_t>("unroll_count", {1});
+
+/// A dummy reward for now. Required to be provided for construction of the
+/// logger. NOTE(review): element type assumed to be float -- confirm; it is
+/// not logged here because the Logger is created with IncludeReward = false.
+static const TensorSpec RewardFeature =
+    TensorSpec::createSpec<float>("reward", {1});
+
+/// Class for MLGO in loop unroll. Features are gathered one loop at a time but
+/// the logs are only flushed after all features are collected, so a single
+/// file-level instance of this class is kept alive across pass runs. Logs are
+/// dumped at the end of compilation, from the dtor of LoopUnrollPass.
+struct MLGOLoopUnrollAnalysis {
+public:
+  /// Ctor for MLGO in loop unroll.
+  MLGOLoopUnrollAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {
+    Runner = std::make_unique<NoInferenceModelRunner>(Ctx, InputFeatures);
+  }
+
+  MLGOLoopUnrollAnalysis() = delete;
+
+  /// Set features for loop
+  void setFeatures(const unsigned LoopSize, const unsigned TripCount,
+                   LoopInfo &LI, ScalarEvolution &SE, Loop &L);
+
+  /// Log features and partial unroll decision for loop
+  void logFeaturesAndDecision(const unsigned PartialUnrollCount, Loop &L);
+
+  /// Save **ALL** the logs
+  bool flush();
+
+  /// Runner for MLModel (training(default) / training(model) / release)
+  std::unique_ptr<MLModelRunner> Runner;
+
+private:
+  template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) {
+    size_t Ret = sizeof(T);
+    for (const auto V : Shape)
+      Ret *= V;
+    return Ret;
+  }
+
+  void resetInputs() {
+#define _RESET(type, name, shape, _)                                           \
+  std::memset(Runner->getTensorUntyped(FeatureIDs::name), 0,                   \
+              getTotalSize<type>(shape));
+    LOOP_UNROLL_FEATURES_LIST(_RESET)
+#undef _RESET
+  }
+
+  LLVMContext &Ctx;
+
+  /// A logger for each loop. Key = "$(MODULE)###$(FUNCTION)###$(LOOP)"
+  StringMap<std::unique_ptr<Logger>> LogMap;
+};
+
+void MLGOLoopUnrollAnalysis::setFeatures(const unsigned LoopSize,
+                                         const unsigned TripCount, LoopInfo &LI,
+                                         ScalarEvolution &SE, Loop &L) {
+  resetInputs();
+
+  LoopPropertiesInfo LPI =
+      LoopPropertiesInfo::getLoopPropertiesInfo(&L, &LI, &SE);
+
+#define SET(id, type, val)                                                     \
+  *Runner->getTensor<type>(FeatureIDs::id) = static_cast<type>(val);
+  SET(loop_size, int64_t, LoopSize);
+  SET(trip_count, int64_t, TripCount);
+  SET(is_innermost_loop, int64_t, LPI.IsInnerMostLoop);
+  SET(preheader_blocksize, int64_t, LPI.PreheaderBlocksize);
+  SET(bb_count, int64_t, LPI.BasicBlockCount);
+  SET(num_of_loop_latch, int64_t, LPI.LoopLatchCount);
+  SET(load_inst_count, int64_t, LPI.LoadInstCount);
+  SET(store_inst_count, int64_t, LPI.StoreInstCount);
+  SET(logical_inst_count, int64_t, LPI.LogicalInstCount);
+  SET(cast_inst_count, int64_t, LPI.CastInstCount);
+#undef SET
+}
+
+void MLGOLoopUnrollAnalysis::logFeaturesAndDecision(const unsigned UnrollCount,
+                                                    Loop &L) {
+  // Key = $(MODULE)###$(FUNCTION)###$(LOOP)
+  std::string Key = L.getHeader()->getModule()->getName().str() + "###" +
+                    L.getHeader()->getParent()->getName().str() + "###" +
+                    L.getName().str();
+
+  assert(!LogMap.count(Key) &&
+         "Should only extract feature for every loop once");
+
+  std::vector<LoggedFeatureSpec> LFS;
+  for (const auto &IF : InputFeatures)
+    LFS.push_back({IF, None});
+  LFS.push_back({OutputFeature, None});
+
+  // Create Logger for loop and insert it to LogMap
+  auto I = LogMap.insert((std::make_pair(
+      Key, std::make_unique<Logger>(LFS, RewardFeature,
+                                    /* IncludeReward */ false))));
+  assert(I.second && "Should be unique insertion");
+
+  Logger *Log = I.first->second.get();
+  size_t CurrentFeature = 0;
+  for (; CurrentFeature < FeatureIDs::FeatureCount; ++CurrentFeature)
+    Log->logSpecifiedTensorValue(CurrentFeature,
+                                 reinterpret_cast<const char *>(
+                                     Runner->getTensorUntyped(CurrentFeature)));
+
+  const int32_t Decision = static_cast<int32_t>(UnrollCount);
+  Log->logInt32Value(CurrentFeature, &Decision);
+
+  LLVM_DEBUG(
+      dbgs() << "(MLGO) Logged features and loop partial unroll decision = "
+             << UnrollCount << " for loop '" << Key << "'\n");
+}
+
+bool MLGOLoopUnrollAnalysis::flush() {
+  std::error_code EC;
+  // The append flag is specified here because modules are compiled at the same
+  // time during usual compilations and we want the logs to be in the same file.
+  auto OS = std::make_unique<raw_fd_ostream>(TrainingLog, EC,
+                                             llvm::sys::fs::OF_Append);
+  if (EC) {
+    Ctx.emitError(EC.message() + ":" + TrainingLog);
+    return false;
+  }
+  Logger::flushLogs(*OS, LogMap);
+  return true;
+}
+
+} // namespace mlgo_loop_unroll
+
+/// File-level MLGO state. Created lazily by LoopUnrollPass::run() and flushed
+/// and released by ~LoopUnrollPass(); null before the first run and after that.
+static mlgo_loop_unroll::MLGOLoopUnrollAnalysis *MLGOAnalysis = nullptr;
+
 /// A magic value for use with the Threshold parameter to indicate
 /// that the loop unroll should be performed regardless of how much
 /// code expansion would result.
@@ -833,7 +1024,8 @@
 static Optional<unsigned>
 shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
                     const UnrollCostEstimator UCE,
-                    const TargetTransformInfo::UnrollingPreferences &UP) {
+                    const TargetTransformInfo::UnrollingPreferences &UP,
+                    ScalarEvolution &SE, LoopInfo &LI, Loop &L) {
 
   if (!TripCount)
     return None;
@@ -843,6 +1035,11 @@
                       << "-unroll-allow-partial not given\n");
     return 0;
   }
+
+  // MLGOAnalysis only exists once LoopUnrollPass::run() has created it.
+  if (EnableMLGOUnroll && MLGOAnalysis)
+    MLGOAnalysis->setFeatures(LoopSize, TripCount, LI, SE, L);
+
   unsigned count = UP.Count;
   if (count == 0)
     count = TripCount;
@@ -876,6 +1073,9 @@
 
   LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << count << "\n");
 
+  if (EnableMLGOUnroll && MLGOAnalysis)
+    MLGOAnalysis->logFeaturesAndDecision(count, L);
+
   return count;
 }
 // Returns true if unroll count was set explicitly.
@@ -993,7 +1193,8 @@
 
   // 6th priority is partial unrolling.
   // Try partial unroll only when TripCount could be statically calculated.
-  if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
+  if (auto UnrollFactor =
+          shouldPartialUnroll(LoopSize, TripCount, UCE, UP, SE, *LI, *L)) {
     UP.Count = *UnrollFactor;
 
     if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
@@ -1577,6 +1778,13 @@
   auto *BFI = (PSI && PSI->hasProfileSummary()) ?
       &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
 
+  // Create the MLGOAnalysis object the first time it is needed
+  if (EnableMLGOUnroll && !MLGOAnalysis) {
+    LLVM_DEBUG(dbgs() << "(MLGO) Create MLGOAnalysis for loop unroll\n");
+    MLGOAnalysis =
+        new mlgo_loop_unroll::MLGOLoopUnrollAnalysis(SE.getContext());
+  }
+
   bool Changed = false;
 
   // The unroller requires loops to be in simplified form, and also needs LCSSA.
@@ -1661,3 +1869,21 @@
   OS << "O" << UnrollOpts.OptLevel;
   OS << ">";
 }
+
+/// Dumps the collected MLGO training logs when a pass object is destroyed.
+///
+/// A LoopUnrollPass object can be destroyed without run() ever having created
+/// MLGOAnalysis (e.g. a temporary that was handed over to the pass manager),
+/// and several pass objects can be destroyed in one compilation. So only flush
+/// when an analysis object exists, and release it afterwards so the same logs
+/// are not appended to the training log again by the next destructor call.
+LoopUnrollPass::~LoopUnrollPass() {
+  if (!EnableMLGOUnroll || !MLGOAnalysis)
+    return;
+
+  // flush() reports I/O errors through the LLVMContext; there is nothing else
+  // to do about them here.
+  MLGOAnalysis->flush();
+  delete MLGOAnalysis;
+  MLGOAnalysis = nullptr;
+}