Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -181,6 +181,7 @@ void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); void initializeIPSCCPLegacyPassPass(PassRegistry&); void initializeIRCELegacyPassPass(PassRegistry&); +void initializeIROutlinerLegacyPassPass(PassRegistry&); void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry&); void initializeIRTranslatorPass(PassRegistry&); void initializeIVUsersWrapperPassPass(PassRegistry&); Index: llvm/include/llvm/Transforms/IPO.h =================================================================== --- llvm/include/llvm/Transforms/IPO.h +++ llvm/include/llvm/Transforms/IPO.h @@ -208,6 +208,11 @@ /// function(s). ModulePass *createHotColdSplittingPass(); +//===----------------------------------------------------------------------===// +/// createIROutlinerPass - This pass finds similar code regions and factors +/// those regions out into functions. +ModulePass *createIROutlinerPass(); + //===----------------------------------------------------------------------===// /// createPartialInliningPass - This pass inlines parts of functions. /// Index: llvm/include/llvm/Transforms/IPO/IROutliner.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -0,0 +1,186 @@ +//===- IROutliner.h - Extract similar IR regions into functions ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// The interface file for the IROutliner which is used by the IROutliner Pass. +// +// The outliner uses the IRSimilarityIdentifier to identify the similar regions +// of code. It evaluates each set of IRSimilarityCandidates with an estimate of +// whether it will provide code size reduction. Each region is extracted using +// the code extractor. These extracted functions are consolidated into a single +// function and called from the extracted call site. +// +// For example: +// \code +// %1 = add i32 %a, %b +// %2 = add i32 %b, %a +// %3 = add i32 %b, %a +// %4 = add i32 %a, %b +// \endcode +// would become function +// \code +// define internal void outlined_ir_function(i32 %0, i32 %1) { +// %1 = add i32 %0, %1 +// %2 = add i32 %1, %0 +// ret void +// } +// \endcode +// with calls: +// \code +// call void outlined_ir_function(i32 %a, i32 %b) +// call void outlined_ir_function(i32 %b, i32 %a) +// \endcode +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_IROUTLINER_H +#define LLVM_TRANSFORMS_IPO_IROUTLINER_H + +#include "llvm/Analysis/IRSimilarityIdentifier.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include + +struct OutlinableGroup; + +namespace llvm { +using namespace IRSimilarity; + +class Module; +class TargetTransformInfo; +class OptimizationRemarkEmitter; + +/// The OutlinableRegion holds all the information for a specific region, or +/// sequence of instructions. This includes what values need to be hoisted to +/// arguments from the extracted function, inputs and outputs to the region, and +/// mapping from the extracted function arguments to overall function arguments. +struct OutlinableRegion { + /// Describes the region of code. + IRSimilarityCandidate *Candidate; + + /// If this region is outlined, the front and back IRInstructionData could + /// potentially become invalidated if the only new instruction is a call. + /// This ensures that we replace in the instruction in the IRInstructionData. + IRInstructionData *NewFront = nullptr; + IRInstructionData *NewBack = nullptr; + + /// Used to create an outlined function. + CodeExtractor *CE = nullptr; + + /// The call site of the extracted region. + CallInst *Call = nullptr; + + /// The function for the extracted region. + Function *ExtractedFunction = nullptr; + + /// Flag for whether we have split out the IRSimilarityCanidate. That is, + /// make the region contained the IRSimilarityCandidate its own BasicBlock. + bool CandidateSplit = false; + + /// Flag for whether we should not consider this region for extraction. + bool IgnoreRegion = false; + + /// The BasicBlock that is before the start of the region BasicBlock, + /// only defined when the region has been split. + BasicBlock *PrevBB = nullptr; + + /// The BasicBlock that contains the starting instruction of the region. + BasicBlock *StartBB = nullptr; + + /// The BasicBlock that contains the ending instruction of the region. + BasicBlock *EndBB = nullptr; + + /// The BasicBlock that is after the start of the region BasicBlock, + /// only defined when the region has been split. + BasicBlock *FollowBB = nullptr; + + /// The Outlinable Group that contains this region and structurally similar + /// regions to this region. + OutlinableGroup *Parent = nullptr; + + OutlinableRegion(IRSimilarityCandidate &C, OutlinableGroup &Group) + : Candidate(&C), Parent(&Group) { + StartBB = C.getStartBB(); + EndBB = C.getEndBB(); + } + + /// For the contained region, split the parent BasicBlock at the starting and + /// ending instructions of the contained IRSimilarityCandidate. + void splitCandidate(); + + /// For the contained region, reattach the BasicBlock at the starting and + /// ending instructions of the contained IRSimilarityCandidate, or if the + /// function has been extracted, the start and end of the BasicBlock + /// containing the called function. + void reattachCandidate(); +}; + +/// This class is a pass that identifies similarity in a Module, extracts +/// instances of the similarity, and then consolidating the similar regions +/// in an effort to reduce code size. It uses the IRSimilarityIdentifier pass +/// to identify the similar regions of code, and then extracts the similar +/// sections into a single function. See the above for an example as to +/// how code is extracted and consolidated into a single function. +class IROutliner { +public: + IROutliner(function_ref GTTI, + function_ref GIRSI) + : getTTI(GTTI), getIRSI(GIRSI) {} + bool run(Module &M); + +private: + /// Find repeated similar code sequences in \p M and outline them into new + /// Functions. + /// + /// \param [in] M - The module to outline from. + /// \returns The number of Functions created. + unsigned doOutline(Module &M); + + /// Remove all the IRSimilarityCandidates from \p CandidateVec that have + /// instructions contained in a previously outlined region and put the + /// remaining regions in \p CurrentGroup. + /// + /// \param [in] CandidateVec - List of similarity candidates for regions with + /// the same similarity structure. + /// \param [in,out] CurrentGroup - Contains the potential sections to + /// be outlined. + void + pruneIncompatibleRegions(std::vector &CandidateVec, + OutlinableGroup &CurrentGroup); + + /// Extract \p Region into its own function. + /// + /// \param [in] Region - The region to be extracted into its own function. + /// \returns True if it was successfully outlined. + bool extractSection(OutlinableRegion &Region); + + /// The set of outlined Instructions, identified by their location in the + /// sequential ordering of instructions in a Module. + DenseSet Outlined; + + /// TargetTransformInfo lambda for target specific information. + function_ref getTTI; + + /// IRSimilarityIdentifier lambda to retrieve IRSimilarityIdentifier. + function_ref getIRSI; + + /// The memory allocator used to allocate the OutlinableRegions and + /// CodeExtractors. + BumpPtrAllocator OutlinerAllocator; +}; + +/// Pass to outline similar regions. +class IROutlinerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_IROUTLINER_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -93,6 +93,7 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/IPO/Internalize.h" +#include "llvm/Transforms/IPO/IROutliner.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" @@ -278,6 +279,7 @@ } extern cl::opt EnableHotColdSplit; +extern cl::opt EnableIROutliner; extern cl::opt EnableOrderFileInstrumentation; extern cl::opt FlattenedProfileUsed; @@ -1204,6 +1206,13 @@ if (EnableHotColdSplit && !LTOPreLink) MPM.addPass(HotColdSplittingPass()); + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, a deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -66,6 +66,7 @@ MODULE_PASS("internalize", InternalizePass()) MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("ipsccp", IPSCCPPass()) +MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(dbgs())) MODULE_PASS("lowertypetests", LowerTypeTestsPass(nullptr, nullptr)) MODULE_PASS("mergefunc", MergeFunctionsPass()) Index: llvm/lib/Transforms/IPO/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/IPO/CMakeLists.txt +++ llvm/lib/Transforms/IPO/CMakeLists.txt @@ -19,6 +19,7 @@ GlobalSplit.cpp HotColdSplitting.cpp IPO.cpp + IROutliner.cpp InferFunctionAttrs.cpp InlineSimple.cpp Inliner.cpp Index: llvm/lib/Transforms/IPO/IPO.cpp =================================================================== --- llvm/lib/Transforms/IPO/IPO.cpp +++ llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalOptLegacyPassPass(Registry); initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); + initializeIROutlinerLegacyPassPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); Index: llvm/lib/Transforms/IPO/IROutliner.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/IPO/IROutliner.cpp @@ -0,0 +1,369 @@ +//===- IROutliner.cpp -- Outline Similar Regions ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +// Implementation for the IROutliner which is used by the IROutliner Pass. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/IROutliner.h" +#include "llvm/Analysis/IRSimilarityIdentifier.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/IPO.h" +#include +#include +#include + +#define DEBUG_TYPE "iroutliner" + +using namespace llvm; +using namespace IRSimilarity; + +/// The OutlinableGroup holds all the overarching information for outlining +/// a set of regions that are structurally similar to one another, such as the +/// types of the overall function, the output blocks, the sets of stores needed +/// and a list of the different regions. This information is used in the +/// deduplication of extracted regions with the same structure. +struct OutlinableGroup { + /// The sections that could be outlined + std::vector Regions; + + /// Flag for whether we should not consider this group of OutlinableRegions + /// for extraction. + bool IgnoreGroup = false; +}; + +/// Move the contents of \p SourceBB to before the last instruction of \p +/// TargetBB. +/// \param SourceBB - the BasicBlock to pull Instructions from. +/// \param TargetBB - the BasicBlock to put Instruction into. +static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) { + BasicBlock::iterator BBCurr, BBEnd, BBNext; + for (BBCurr = SourceBB.begin(), BBEnd = SourceBB.end(); BBCurr != BBEnd; + BBCurr = BBNext) { + BBNext = std::next(BBCurr); + BBCurr->moveBefore(TargetBB, TargetBB.end()); + } +} + +void OutlinableRegion::splitCandidate() { + assert(!CandidateSplit && "Candidate already split!"); + + Instruction *StartInst = (*Candidate->begin()).Inst; + Instruction *EndInst = (*Candidate->end()).Inst; + assert(StartInst && EndInst && "Expected a start and end instruction?"); + StartBB = StartInst->getParent(); + PrevBB = StartBB; + + // The basic block gets split like so: + // block: block: + // inst1 inst1 + // inst2 inst2 + // region1 br block_to_outline + // region2 block_to_outline: + // region3 -> region1 + // region4 region2 + // inst3 region3 + // inst4 region4 + // br block_after_outline + // block_after_outline: + // inst3 + // inst4 + + std::string OriginalName = PrevBB->getName().str(); + + StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline"); + + // This is the case for the inner block since we do not have to include + // multiple blocks. + EndBB = StartBB; + FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + + CandidateSplit = true; +} + +void OutlinableRegion::reattachCandidate() { + assert(CandidateSplit && "Candidate is not split!"); + + // The basic block gets reattached like so: + // block: block: + // inst1 inst1 + // inst2 inst2 + // br block_to_outline region1 + // block_to_outline: -> region2 + // region1 region3 + // region2 region4 + // region3 inst3 + // region4 inst4 + // br block_after_outline + // block_after_outline: + // inst3 + // inst4 + assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); + assert(FollowBB != nullptr && "StartBB for Candidate is not defined!"); + + // StartBB should only have one predecessor since we put an unconditional + // branch at the end of PrevBB when we split the BasicBlock. + PrevBB = StartBB->getSinglePredecessor(); + assert(PrevBB != nullptr && + "No Predecessor for the region start basic block!"); + + assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); + assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); + PrevBB->getTerminator()->eraseFromParent(); + EndBB->getTerminator()->eraseFromParent(); + + moveBBContents(*StartBB, *PrevBB); + + BasicBlock *PlacementBB = PrevBB; + if (StartBB != EndBB) + PlacementBB = EndBB; + moveBBContents(*FollowBB, *PlacementBB); + + PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB); + PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + StartBB->eraseFromParent(); + FollowBB->eraseFromParent(); + + // Make sure to save changes back to the StartBB. + StartBB = PrevBB; + EndBB = nullptr; + PrevBB = nullptr; + FollowBB = nullptr; + + CandidateSplit = false; +} + +void IROutliner::pruneIncompatibleRegions( + std::vector &CandidateVec, + OutlinableGroup &CurrentGroup) { + bool PreviouslyOutlined; + + // Sort from beginning to end, so the IRSimilarityCandidates are in order. + stable_sort(CandidateVec, [](const IRSimilarityCandidate &LHS, + const IRSimilarityCandidate &RHS) { + return LHS.getStartIdx() < RHS.getStartIdx(); + }); + + unsigned CurrentEndIdx = 0; + for (IRSimilarityCandidate &IRSC : CandidateVec) { + PreviouslyOutlined = false; + unsigned StartIdx = IRSC.getStartIdx(); + unsigned EndIdx = IRSC.getEndIdx(); + + for (unsigned Idx = StartIdx; Idx <= EndIdx; Idx++) + if (Outlined.find(Idx) != Outlined.end()) { + PreviouslyOutlined = true; + break; + } + + if (PreviouslyOutlined) + continue; + + // TODO: If in the future we can outline across BasicBlocks, we will need to + // check all BasicBlocks contained in the region. + if (IRSC.getStartBB()->hasAddressTaken()) + continue; + + // Greedily prune out any regions that will overlap with already chosen + // regions. + if (CurrentEndIdx != 0 && StartIdx <= CurrentEndIdx) + continue; + + OutlinableRegion *OS = new (OutlinerAllocator.Allocate()) + OutlinableRegion(IRSC, CurrentGroup); + CurrentGroup.Regions.push_back(OS); + + CurrentEndIdx = EndIdx; + } +} + +bool IROutliner::extractSection(OutlinableRegion &Region) { + assert(Region.StartBB != nullptr && + "StartBB for the OutlinableRegion is nullptr!"); + assert(Region.FollowBB != nullptr && + "StartBB for the OutlinableRegion is nullptr!"); + Function *OrigF = Region.StartBB->getParent(); + CodeExtractorAnalysisCache CEAC(*OrigF); + Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); + + // If the extraction was successful, find the BasicBlock, and reassign the + // OutlinableRegion blocks + if (!Region.ExtractedFunction) { + LLVM_DEBUG(dbgs() << "CodeExtractor failed to outline " << Region.StartBB + << "\n"); + Region.reattachCandidate(); + return false; + } + + BasicBlock *RewrittenBB = Region.FollowBB->getSinglePredecessor(); + Region.StartBB = RewrittenBB; + Region.EndBB = RewrittenBB; + + // The sequences of outlinable regions has now changed. We must fix the + // IRInstructionDataList for consistency. Although they may not be illegal + // instructions, they should not be compared with anything else as they + // should not be outlined in this round. So marking these as illegal is + // allowed. + IRInstructionDataList *IDL = Region.Candidate->front()->IDL; + Region.NewFront = new (OutlinerAllocator.Allocate()) + IRInstructionData(*RewrittenBB->begin(), false, *IDL); + Region.NewBack = new (OutlinerAllocator.Allocate()) + IRInstructionData(*std::prev(std::prev(RewrittenBB->end())), false, *IDL); + + // Insert the first IRInstructionData of the new region in front of the + // first IRInstructionData of the IRSimilarityCandidate. + IDL->insert(Region.Candidate->begin(), *Region.NewFront); + // Insert the first IRInstructionData of the new region after the + // last IRInstructionData of the IRSimilarityCandidate. + IDL->insert(Region.Candidate->end(), *Region.NewBack); + // Remove the IRInstructionData from the IRSimilarityCandidate. + IDL->erase(Region.Candidate->begin(), std::prev(Region.Candidate->end())); + + assert(RewrittenBB != nullptr && + "Could not find a predecessor after extraction!"); + + // Iterate over the new set of instructions to find the new call + // instruction. + for (Instruction &I : *RewrittenBB) + if (CallInst *CI = dyn_cast(&I)) + if (Region.ExtractedFunction == CI->getCalledFunction()) { + Region.Call = CI; + break; + } + Region.reattachCandidate(); + return true; +} + +unsigned IROutliner::doOutline(Module &M) { + // Find the possibile similarity sections. + IRSimilarityIdentifier &Identifier = getIRSI(M); + SimilarityGroupList &SimilarityCandidates = Identifier.getSimilarity(); + + // Sort them by size of extracted sections + unsigned OutlinedFunctionNum = 0; + // If we only have one SimilarityGroup in SimilarityCandidates, we do not have + // to sort them by the potential number of instructions to be outlined + if (SimilarityCandidates.size() > 1) + llvm::stable_sort(SimilarityCandidates, + [](const std::vector &LHS, + const std::vector &RHS) { + return LHS[0].getLength() * LHS.size() > + RHS[0].getLength() * RHS.size(); + }); + + // Iterate over the possible sets of similarity. + for (SimilarityGroup &CandidateVec : SimilarityCandidates) { + OutlinableGroup CurrentGroup; + + // Remove entries that were previously outlined + pruneIncompatibleRegions(CandidateVec, CurrentGroup); + + // We pruned the number of regions to 0 to 1, meaning that it's not worth + // trying to outlined since there is no compatible similar instance of this + // code. + if (CurrentGroup.Regions.size() < 2) + continue; + + // Create a CodeExtractor for each outlinable region. + for (OutlinableRegion *OS : CurrentGroup.Regions) { + // Break the outlinable region out of its parent BasicBlock into its own + // BasicBlocks (see function implementation). + OS->splitCandidate(); + std::vector BE = {OS->StartBB}; + OS->CE = new (OutlinerAllocator.Allocate()) + CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, + false, "outlined"); + } + + // Create functions out of all the sections, and mark them as outlined + std::vector OutlinedRegions; + for (OutlinableRegion *OS : CurrentGroup.Regions) { + OutlinedFunctionNum++; + bool FunctionOutlined = extractSection(*OS); + if (FunctionOutlined) { + unsigned StartIdx = OS->Candidate->getStartIdx(); + unsigned EndIdx = OS->Candidate->getEndIdx(); + for (unsigned Idx = StartIdx; Idx <= EndIdx; Idx++) + Outlined.insert(Idx); + + OutlinedRegions.push_back(OS); + } + } + + CurrentGroup.Regions = std::move(OutlinedRegions); + } + + return OutlinedFunctionNum; +} + +bool IROutliner::run(Module &M) { return doOutline(M) > 0; } + +// Pass Manager Boilerplate +class IROutlinerLegacyPass : public ModulePass { +public: + static char ID; + IROutlinerLegacyPass() : ModulePass(ID) { + initializeIROutlinerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + + bool runOnModule(Module &M) override; +}; + +bool IROutlinerLegacyPass::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + auto GTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis().getTTI(F); + }; + + auto GIRSI = [this](Module &) -> IRSimilarityIdentifier & { + return this->getAnalysis().getIRSI(); + }; + + return IROutliner(GTTI, GIRSI).run(M); +} + +PreservedAnalyses IROutlinerPass::run(Module &M, ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + + std::function GTTI = + [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + + std::function GIRSI = + [&AM](Module &M) -> IRSimilarityIdentifier & { + return AM.getResult(M); + }; + + if (IROutliner(GTTI, GIRSI).run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +char IROutlinerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(IROutlinerLegacyPass, "iroutliner", "IR Outliner", false, + false) +INITIALIZE_PASS_DEPENDENCY(IRSimilarityIdentifierWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(IROutlinerLegacyPass, "iroutliner", "IR Outliner", false, + false) + +ModulePass *llvm::createIROutlinerPass() { return new IROutlinerLegacyPass(); } Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -103,6 +103,9 @@ cl::opt EnableHotColdSplit("hot-cold-split", cl::init(false), cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass")); +cl::opt EnableIROutliner("ir-outliner", cl::init(false), cl::Hidden, + cl::desc("Enable ir outliner pass")); + static cl::opt UseLoopVersioningLICM( "enable-loop-versioning-licm", cl::init(false), cl::Hidden, cl::desc("Enable the experimental Loop Versioning LICM pass")); @@ -846,6 +849,9 @@ if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO)) MPM.add(createHotColdSplittingPass()); + if (EnableIROutliner) + MPM.add(createIROutlinerPass()); + if (MergeFunctions) MPM.add(createMergeFunctionsPass()); Index: llvm/test/Transforms/IROutliner/extraction.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/extraction.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner < %s | FileCheck %s + +; This test makes sure we are extracting the found similarity sections +; correctly at the call site. + +define void @extract1() { +; CHECK-LABEL: @extract1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @extract1.outlined(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @extract2() { +; CHECK-LABEL: @extract2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @extract2.outlined(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @extract_outs1() #0 { +; CHECK-LABEL: @extract_outs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @extract_outs1.outlined(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]]) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4 +; CHECK-NEXT: call void @extract_outs1.outlined.1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %3 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + ret void +} + +define void @extract_outs2() #0 { +; CHECK-LABEL: @extract_outs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @extract_outs2.outlined(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]]) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @extract_outs2.outlined.2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + ret void +} Index: llvm/test/Transforms/IROutliner/outlining-address-taken.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-address-taken.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner < %s | FileCheck %s + +; This test shows that we do not outline from basic blocks with their address +; taken. + +@ba1 = constant i8* blockaddress (@dontoutline, %new_block) + +define void @outline_1() { +; CHECK-LABEL: @outline_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @[[FUNCTION_1:.*]](i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @outline_2() { +; CHECK-LABEL: @outline_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @[[FUNCTION_2:.*]](i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @dontoutline() { +; CHECK-LABEL: @dontoutline( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[NEW_BLOCK:%.*]] +; CHECK: new_block: +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: store i32 3, i32* [[B]], align 4 +; CHECK-NEXT: store i32 4, i32* [[C]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[C]], align 4 +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + br label %new_block +new_block: + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +; CHECK: define internal void @[[FUNCTION_2]](i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) +; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[ARG0]], align 4 +; CHECK-NEXT: store i32 3, i32* [[ARG1]], align 4 +; CHECK-NEXT: store i32 4, i32* [[ARG2]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[ARG2]], align 4 Index: llvm/test/Transforms/IROutliner/outlining-different-structure.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-different-structure.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner < %s | FileCheck %s + +; This is a negative case to show that when we have the same set of +; instructions, but in a different order, they are not outlined in the same way. +; In this case, the arguments passed into the function are in a different order. + +define void @outline_constants1() { +; CHECK-LABEL: @outline_constants1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: store i32 3, i32* [[B]], align 4 +; CHECK-NEXT: store i32 4, i32* [[C]], align 4 +; CHECK-NEXT: call void @[[FUNCTION_0:.*]](i32* [[A]], i32* [[C]], i32* [[B]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %cl = load i32, i32* %c + %bl = load i32, i32* %b + ret void +} + +define void @outline_constants2() { +; CHECK-LABEL: @outline_constants2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 2, i32* [[A]], align 4 +; CHECK-NEXT: store i32 3, i32* [[B]], align 4 +; CHECK-NEXT: store i32 4, i32* [[C]], align 4 +; CHECK-NEXT: call void @[[FUNCTION_1:.*]](i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +; CHECK: define internal void @[[FUNCTION_0]](i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) +; CHECK: entry_to_outline: +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[ARG2]], align 4 + +; CHECK: define internal void @[[FUNCTION_1]](i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) +; CHECK: entry_to_outline: +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[ARG2]], align 4 Index: llvm/test/Transforms/IROutliner/outlining-same-constants.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-same-constants.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner < %s | FileCheck %s + +; This test looks at the constants in the regions, and if it they are the +; same it outlines them as constants rather than elevating them to arguments. + +define void @outline_constants1() { +; CHECK-LABEL: @outline_constants1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @[[FUNCTION_0:.*]](i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @outline_constants2() { +; CHECK-LABEL: @outline_constants2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @[[FUNCTION_1:.*]](i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +; CHECK: define internal void @[[FUNCTION_0]](i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) +; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[ARG0]], align 4 +; CHECK-NEXT: store i32 3, i32* [[ARG1]], align 4 +; CHECK-NEXT: store i32 4, i32* [[ARG2]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[ARG2]], align 4 + +; CHECK: define internal void @[[FUNCTION_1]](i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) +; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[ARG0]], align 4 +; CHECK-NEXT: store i32 3, i32* [[ARG1]], align 4 +; CHECK-NEXT: store i32 4, i32* [[ARG2]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, i32* [[ARG2]], align 4 Index: llvm/test/Transforms/IROutliner/outlining-same-globals.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-same-globals.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -verify -iroutliner < %s | FileCheck %s + +@global1 = global i32 1, align 4 +@global2 = global i32 2, align 4 + +; This test looks at the globals in the regions, and if it they are the +; same it outlines the region without elevating the globals to arguments. + +define void @outline_globals1() { +; CHECK-LABEL: @outline_globals1( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @[[FUNCTION_0:.*]]() +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, i32* @global1 + %1 = load i32, i32* @global2 + %2 = add i32 %0, %1 + ret void +} + +define void @outline_globals2() { +; CHECK-LABEL: @outline_globals2( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @[[FUNCTION_1:.*]]() +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, i32* @global1 + %1 = load i32, i32* @global2 + %2 = add i32 %0, %1 + ret void +} + +; CHECK: define internal void @[[FUNCTION_0]]() +; CHECK: entry_to_outline: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @global1, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @global2, align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] + + +; CHECK: define internal void @[[FUNCTION_1]]() +; CHECK: entry_to_outline: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @global1, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @global2, align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] +