Index: llvm/include/llvm/Analysis/IRSimilarityIdentifier.h =================================================================== --- llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -127,6 +127,8 @@ /// to a less than form. It is None otherwise. Optional RevisedPredicate; + SmallVector RelativeBlockLocations; + /// Gather the information that is difficult to gather for an Instruction, or /// is changed. i.e. the operands of an Instruction and the Types of those /// operands. This extra information allows for similarity matching to make @@ -145,6 +147,9 @@ /// \return the consistent comparison predicate. static CmpInst::Predicate predicateForConsistency(CmpInst *CI); + void + setBranchSuccessors(DenseMap &BasicBlockToInteger); + /// Hashes \p Value based on its opcode, types, and operand types. /// Two IRInstructionData instances produce the same hash when they perform /// the same operation. @@ -189,6 +194,11 @@ llvm::hash_value(ID.Inst->getType()), llvm::hash_value(CI->getCalledFunction()->getName().str()), llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + else if (BranchInst *BI = dyn_cast(ID.Inst)) + return hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); return llvm::hash_combine( llvm::hash_value(ID.Inst->getOpcode()), llvm::hash_value(ID.Inst->getType()), @@ -284,10 +294,14 @@ /// The next available integer to assign to a legal Instruction to. unsigned LegalInstrNumber = 0; + unsigned BBNumber = 0; + /// Correspondence from IRInstructionData to unsigned integers. DenseMap InstructionIntegerMap; + DenseMap BasicBlockToInteger; + /// Set if we added an illegal number in the previous step. /// Since each illegal number is unique, we only need one of them between /// each range of legal numbers. 
This lets us make sure we don't add more @@ -329,6 +343,16 @@ IRInstructionDataList *IDL = nullptr; + void initializeForBBs(Function &F) { + for (BasicBlock &BB : F) + BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++)); + } + + void initializeForBBs(Module &M) { + for (Function &F : M) + initializeForBBs(F); + } + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers /// determined by \p InstrType. Two Instructions are mapped to the same value /// if they are close as defined by the InstructionData class above. @@ -386,7 +410,11 @@ InstructionClassification() {} // TODO: Determine a scheme to resolve when the label is similar enough. - InstrType visitBranchInst(BranchInst &BI) { return Illegal; } + InstrType visitBranchInst(BranchInst &BI) { + if (EnableBranches) + return Legal; + return Illegal; + } // TODO: Determine a scheme to resolve when the labels are similar enough. InstrType visitPHINode(PHINode &PN) { return Illegal; } // TODO: Handle allocas. @@ -419,6 +447,8 @@ // TODO: Handle interblock similarity. InstrType visitTerminator(Instruction &I) { return Illegal; } InstrType visitInstruction(Instruction &I) { return Legal; } + + bool EnableBranches = false; }; /// Maps an Instruction to a member of InstrType. @@ -496,6 +526,8 @@ DenseMap CanonNumToNumber; /// @} + DenseSet BasicBlocks; + public: /// \param StartIdx - The starting location of the region. /// \param Len - The length of the region. @@ -631,6 +663,9 @@ /// \returns The BasicBlock the IRSimilarityCandidate ends in. BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } + /// \returns The BasicBlocks contained in the IRSimilarityCandidate. + DenseSet &getBasicBlocks() { return BasicBlocks; } + /// \returns The Function that the IRSimilarityCandidate is located in. 
Function *getFunction() { return getStartBB()->getParent(); } Index: llvm/include/llvm/Transforms/IPO/IROutliner.h =================================================================== --- llvm/include/llvm/Transforms/IPO/IROutliner.h +++ llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -90,6 +90,14 @@ /// call. bool ChangedArgOrder = false; + /// Marks whether this region ends in a branch, there is special handling + /// required for the following basic blocks in this case. + bool EndsInBranch = false; + + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap PHIBlocks; + /// Mapping of the argument number in the deduplicated function /// to a given constant, which is used when creating the arguments to the call /// to the newly created deduplicated function. This is handled separately @@ -179,6 +187,10 @@ /// for extraction. bool IgnoreGroup = false; + /// Keeping track of how many different branches outside the region the + /// regions in the group perform. + unsigned BranchesToOutside = 0; + /// The return blocks for the overall function. DenseMap EndBBs; @@ -391,7 +403,7 @@ InstructionAllowed() {} // TODO: Determine a scheme to resolve when the label is similar enough. - bool visitBranchInst(BranchInst &BI) { return false; } + bool visitBranchInst(BranchInst &BI) { return true; } // TODO: Determine a scheme to resolve when the labels are similar enough. bool visitPHINode(PHINode &PN) { return false; } // TODO: Handle allocas. @@ -430,6 +442,8 @@ // TODO: Handle interblock similarity. bool visitTerminator(Instruction &I) { return false; } bool visitInstruction(Instruction &I) { return true; } + + bool EnableBranches = false; }; /// A InstVisitor used to exclude certain instructions from being outlined. 
Index: llvm/lib/Analysis/IRSimilarityIdentifier.cpp =================================================================== --- llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -23,6 +23,10 @@ using namespace llvm; using namespace IRSimilarity; +cl::opt EnableBranches( + "ir-sim-branches", cl::init(false), cl::ReallyHidden, + cl::desc("enable similarity mactching across branches.")); + IRInstructionData::IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDList) : Inst(&I), Legal(Legality), IDL(&IDList) { @@ -49,6 +53,30 @@ } } +void IRInstructionData::setBranchSuccessors( + DenseMap &BasicBlockToInteger) { + assert(isa(Inst) && "Instruction must be branch"); + + BranchInst *BI = cast(Inst); + DenseMap::iterator BBNumIt; + + BBNumIt = BasicBlockToInteger.find(BI->getParent()); + assert(BBNumIt != BasicBlockToInteger.end() && + "Could not find location for BasicBlock!"); + + int CurrentBlockNumber = static_cast(BBNumIt->second); + + for (BasicBlock *Successor : BI->successors()) { + BBNumIt = BasicBlockToInteger.find(Successor); + assert(BBNumIt != BasicBlockToInteger.end() && + "Could not find number for BasicBlock!"); + int OtherBlockNumber = static_cast(BBNumIt->second); + + int Relative = OtherBlockNumber - CurrentBlockNumber; + RelativeBlockLocations.push_back(Relative); + } +} + CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) { switch (CI->getPredicate()) { case CmpInst::FCMP_OGT: @@ -143,6 +171,11 @@ return false; } + if (isa(A.Inst) && isa(B.Inst)) { + if (A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size()) + return false; + } + return true; } @@ -156,10 +189,6 @@ std::vector IntegerMappingForBB; std::vector InstrListForBB; - HaveLegalRange = false; - CanCombineWithPrevInstr = false; - AddedIllegalLastTime = true; - for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) { switch (InstClassifier.visit(*It)) { case InstrType::Legal: @@ -175,7 +204,8 @@ 
} if (HaveLegalRange) { - mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); + if (AddedIllegalLastTime) + mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); for (IRInstructionData *ID : InstrListForBB) this->IDL->push_back(*ID); llvm::append_range(InstrList, InstrListForBB); @@ -203,6 +233,9 @@ IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL); InstrListForBB.push_back(ID); + if (isa(*It)) + ID->setBranchSuccessors(BasicBlockToInteger); + // Add to the instruction list bool WasInserted; DenseMap::iterator @@ -322,6 +355,9 @@ NumberToValue.try_emplace(LocalValNumber, ID->Inst); LocalValNumber++; } + + BasicBlock *BB = ID->Inst->getParent(); + BasicBlocks.insert(BB); } // Setting the first and last instruction data pointers for the candidate. If @@ -640,6 +676,43 @@ {A, OperValsA, ValueNumberMappingA}, {B, OperValsB, ValueNumberMappingB})) return false; + + // Here we check that between two corresponding branch instructions, + // when the branch targets a basic block within the same region, the + // relative locations are the same, and otherwise, are targeting basic + // blocks outside the region at the same locations. 
+ if (isa(ItA->Inst) && isa(ItB->Inst)) { + if (ItA->RelativeBlockLocations.size() != + ItB->RelativeBlockLocations.size()) + return false; + + if (OperValsA.size() != OperValsB.size()) + return false; + + detail::zippy &, + SmallVector &, ArrayRef &, + ArrayRef &> + ZippedRelativeLocations = + zip(ItA->RelativeBlockLocations, ItB->RelativeBlockLocations, + OperValsA, OperValsB); + if (!all_of(ZippedRelativeLocations, + [&A, &B](std::tuple R) { + BasicBlock *ABB = static_cast(std::get<2>(R)); + BasicBlock *BBB = static_cast(std::get<3>(R)); + bool AContained = false; + bool BContained = false; + if (A.BasicBlocks.find(ABB) != A.BasicBlocks.end()) + AContained = true; + if (B.BasicBlocks.find(BBB) != B.BasicBlocks.end()) + BContained = true; + if (AContained != BContained) + return false; + if (AContained) + return std::get<0>(R) == std::get<1>(R); + return true; + })) + return false; + } } return true; } @@ -665,6 +738,8 @@ std::vector IntegerMappingForModule; // Iterate over the functions in the module to map each Instruction in each // BasicBlock to an unsigned integer. + Mapper.initializeForBBs(M); + for (Function &F : M) { if (F.empty()) @@ -672,15 +747,16 @@ for (BasicBlock &BB : F) { - if (BB.sizeWithoutDebug() < 2) - continue; - // BB has potential to have similarity since it has a size greater than 2 // and can therefore match other regions greater than 2. Map it to a list // of unsigned integers. 
Mapper.convertToUnsignedVec(BB, InstrListForModule, IntegerMappingForModule); } + + BasicBlock::iterator It = F.begin()->end(); + Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule, + true); } // Insert the InstrListForModule at the end of the overall InstrList so that @@ -914,6 +990,7 @@ void IRSimilarityIdentifier::findCandidates( std::vector &InstrList, std::vector &IntegerMapping) { + SuffixTree ST(IntegerMapping); std::vector CandsForRepSubstring; @@ -932,12 +1009,15 @@ continue; findCandidateStructures(CandsForRepSubstring, StructuralGroups); - for (std::pair &Group : StructuralGroups) + for (std::pair &Group : StructuralGroups) { // We only add the group if it contains more than one // IRSimilarityCandidate. If there is only one, that means there is no // other repeated subsequence with the same structure. - if (Group.second.size() > 1) + + if (Group.second.size() > 1) { SimilarityCandidates->push_back(Group.second); + } + } CandsForRepSubstring.clear(); StructuralGroups.clear(); @@ -951,6 +1031,7 @@ std::vector InstrList; std::vector IntegerMapping; + Mapper.InstClassifier.EnableBranches = EnableBranches; populateMapper(Modules, InstrList, IntegerMapping); findCandidates(InstrList, IntegerMapping); @@ -960,6 +1041,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { resetSimilarityCandidates(); + Mapper.InstClassifier.EnableBranches = EnableBranches; std::vector InstrList; std::vector IntegerMapping; Index: llvm/lib/Transforms/IPO/IROutliner.cpp =================================================================== --- llvm/lib/Transforms/IPO/IROutliner.cpp +++ llvm/lib/Transforms/IPO/IROutliner.cpp @@ -34,6 +34,8 @@ using namespace llvm; using namespace IRSimilarity; +extern cl::opt EnableBranches; + // Set to true if the user wants the ir outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. 
Since the outliner is confined to a single module (modulo LTO), @@ -67,13 +69,14 @@ void OutlinableRegion::splitCandidate() { assert(!CandidateSplit && "Candidate already split!"); + Instruction *EndInst = Candidate->end()->Inst; + Instruction *BackInst = Candidate->backInstruction(); - if (Candidate->end()->Inst != - Candidate->backInstruction()->getNextNonDebugInstruction()) - return; + if (!BackInst->isTerminator()) + if (EndInst != BackInst->getNextNonDebugInstruction()) + return; Instruction *StartInst = (*Candidate->begin()).Inst; - Instruction *EndInst = (*Candidate->end()).Inst; assert(StartInst && EndInst && "Expected a start and end instruction?"); StartBB = StartInst->getParent(); PrevBB = StartBB; @@ -97,10 +100,14 @@ StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline"); - // This is the case for the inner block since we do not have to include - // multiple blocks. - EndBB = StartBB; - FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + if (!BackInst->isTerminator()) { + EndBB = EndInst->getParent(); + FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + } else { + EndBB = BackInst->getParent(); + EndsInBranch = true; + FollowBB = nullptr; + } CandidateSplit = true; } @@ -123,7 +130,6 @@ // inst3 // inst4 assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); - assert(FollowBB != nullptr && "StartBB for Candidate is not defined!"); // StartBB should only have one predecessor since we put an unconditional // branch at the end of PrevBB when we split the BasicBlock. 
@@ -132,21 +138,27 @@ "No Predecessor for the region start basic block!"); assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); - assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); PrevBB->getTerminator()->eraseFromParent(); - EndBB->getTerminator()->eraseFromParent(); + if (!EndsInBranch) { + assert(FollowBB != nullptr && "FollowBB for Candidate is not defined!"); + assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); + EndBB->getTerminator()->eraseFromParent(); + } moveBBContents(*StartBB, *PrevBB); BasicBlock *PlacementBB = PrevBB; if (StartBB != EndBB) PlacementBB = EndBB; - moveBBContents(*FollowBB, *PlacementBB); + if (!EndsInBranch) + moveBBContents(*FollowBB, *PlacementBB); PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB); - PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + if (!EndsInBranch) { + PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + FollowBB->eraseFromParent(); + } StartBB->eraseFromParent(); - FollowBB->eraseFromParent(); // Make sure to save changes back to the StartBB. StartBB = PrevBB; @@ -204,19 +216,27 @@ // division instruction for targets that have a native division instruction. // To be overly conservative, we only add 1 to the number of instructions for // each division instruction. 
- for (Instruction &I : *StartBB) { - switch (I.getOpcode()) { - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::UDiv: - case Instruction::URem: - Benefit += 1; - break; - default: - Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - break; + DenseSet CandidateBlocks; + for (IRInstructionData &ID : *Candidate) { + BasicBlock *BB = ID.Inst->getParent(); + CandidateBlocks.insert(BB); + } + + for (BasicBlock *BB : CandidateBlocks) { + for (Instruction &I : *BB) { + switch (I.getOpcode()) { + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::URem: + Benefit += 1; + break; + default: + Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + break; + } } } @@ -316,8 +336,25 @@ unsigned FunctionNameSuffix) { assert(!Group.OutlinedFunction && "Function is already defined!"); + Type *RetTy = Type::getVoidTy(M.getContext()); + // All extracted functions _should_ have the same return type at this point. + // since the similarity identifier ensures that all branches outside of the + // region occur in the same place. + + // NOTE: Should we ever move to the model that uses a switch at every point + // needed, meaning that we could branch within the region or out, it is + // possible that we will need to switch to using the most general case all of + // the time. 
+ for (OutlinableRegion *R : Group.Regions) { + Type *ExtractedFuncType = R->ExtractedFunction->getReturnType(); + if (RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) + RetTy = ExtractedFuncType; + else if (RetTy->isIntegerTy(1) && RetTy->isIntegerTy(16)) + RetTy = ExtractedFuncType; + } + Group.OutlinedFunctionType = FunctionType::get( - Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false); + RetTy, Group.ArgumentTypes, false); // These functions will only be called from within the same module, so // we can set an internal linkage. @@ -378,6 +415,7 @@ static void moveFunctionData(Function &Old, Function &New, DenseMap &NewEnds) { Function::iterator CurrBB, NextBB, FinalBB; + // BasicBlock * NewEnds = nullptr; std::vector DebugInsts; for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB; CurrBB = NextBB) { @@ -691,10 +729,51 @@ /// \param [in] Outputs - The values found by the code extractor. static void findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, - ArrayRef Outputs) { + SetVector &Outputs) { OutlinableGroup &Group = *Region.Parent; IRSimilarityCandidate &C = *Region.Candidate; + std::vector BE; + DenseSet BBSet; + for (IRInstructionData &ID : C) { + BasicBlock *BB = ID.Inst->getParent(); + if (BBSet.contains(BB)) + continue; + BE.push_back(BB); + BBSet.insert(BB); + } + + SmallPtrSet Exits; + for (BasicBlock *Block : BE) { + for (BasicBlock *Succ : successors(Block)) { + if (!BBSet.contains(Succ)) { + Exits.insert(Succ); + } + } + } + + // After determining which blocks exit to PHINodes, we add these PHINodes to + // the set of outputs to be processed. + DenseSet PHIWrapped; + for (BasicBlock *ExitBB : Exits) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. 
+ SmallVector IncomingVals; + for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) + if (BBSet.contains(PN.getIncomingBlock(i))) + IncomingVals.push_back(i); + + // Do not process PHI if there is one (or fewer) predecessor from region. + if (IncomingVals.size() <= 1) + continue; + + Outputs.insert(&PN); + + for (unsigned i : IncomingVals) + PHIWrapped.insert(PN.getIncomingValue(i)); + } + } + // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -709,6 +788,7 @@ // type to the overall argument type list. We also store the GVNs used for // stores to identify which values will need to be moved into an special // block that holds the stores to the output registers. + SmallVector GlobalValues; for (Value *Output : Outputs) { TypeFound = false; // We can do this since it is a result value, and will have a number @@ -716,9 +796,35 @@ // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); + GlobalValues.clear(); + + // Since values outside the region can be combined into PHINode when we + // have multiple exists, we check if we have a PHINode in the output, to + // mark that both of these outputs are getting caught. + + // If we cannot find a GVN, this means that the input to the PHINode is not + // included in the region we are trying to analyze, meaning, that if it was + // outlined, we would be adding an extra input. We ignore this case for + // now, and so ignore the region. + + // TODO: Adapt to the extra input from the PHINode. 
+ if (PHINode *PN = dyn_cast(Output)) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) { + Value *V = PN->getIncomingValue(i); + if (!C.getGVN(V).hasValue()) { + Region.IgnoreRegion = true; + return; + } + GlobalValues.push_back(C.getGVN(V).getValue()); + } + } else + GlobalValues.push_back(C.getGVN(Output).getValue()); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (PHIWrapped.contains(Output)) + continue; + for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -730,7 +836,8 @@ AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + for (unsigned GlobalValue : GlobalValues) + Region.GVNStores.push_back(GlobalValue); break; } @@ -744,7 +851,8 @@ std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); Region.AggArgToExtracted.insert( std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + for (unsigned GlobalValue : GlobalValues) + Region.GVNStores.push_back(GlobalValue); } stable_sort(Region.GVNStores); @@ -770,7 +878,7 @@ // Map the outputs found by the CodeExtractor to the arguments found for // the overall function. - findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef()); + findExtractedOutputToOverallOutputMapping(Region, Outputs); } /// Replace the extracted function in the Region with a call to the overall @@ -869,6 +977,9 @@ // Transfer any debug information. Call->setDebugLoc(Region.Call->getDebugLoc()); + // Since our output may determine which branch we go to, we make sure to + // propagate this new call value through the module. + OldCall->replaceAllUsesWith(Call); // Remove the old instruction. 
OldCall->eraseFromParent(); @@ -945,16 +1056,96 @@ continue; ReturnInst *RI = cast(Term); Value *RetVal = RI->getReturnValue(); - DenseMap::iterator VBBIt = OutputBBs.find(RetVal); + DenseMap::iterator VBBIt, PHIVBBIt; + VBBIt = OutputBBs.find(RetVal); assert(VBBIt != OutputBBs.end() && "Could not find output value!"); Instruction *NewI = I->clone(); NewI->setDebugLoc(DebugLoc()); BasicBlock *OutputBB = VBBIt->second; OutputBB->getInstList().push_back(NewI); - LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " - << *OutputBB << "\n"); + LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " + << *OutputBB << "\n"); + + // If this is storing a PHINode, we must make sure it is included in the + // overall function. + StoreInst *SI = static_cast(NewI); + if (!isa(SI->getValueOperand())) + continue; + PHINode *PN = cast(SI->getValueOperand()); + // If it has a value, it was not split by the code extractor, which + // is what we are looking for. + if (Region.Candidate->getGVN(PN).hasValue()) + continue; + + // Find if a PHIBlock exists for this return value already. If it is + // the first time we are anaylzing this, we will not, so we record it. + PHIVBBIt = Group.PHIBlocks.find(RetVal); + bool Inserted = false; + if (FirstFunction) { + BasicBlock *PHIBlock = PN->getParent(); + std::tie(PHIVBBIt, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + } + + Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent())); + + if (PHIVBBIt != Group.PHIBlocks.end()) + continue; + + // If we did not find a block, we create one, and insert it into the + // overall function and record it. + BasicBlock *PHIBlock = BasicBlock::Create(I->getContext(), "phi_block", + OutputBB->getParent()); + std::tie(PHIVBBIt, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + BranchInst::Create(OutputBB, PHIBlock); + // We find the predecessors in the overall function, and make sure + // we now branch to this new block. 
+ for (BasicBlock *Pred : predecessors(PHIBlock)) { + Instruction *Term = Pred->getTerminator(); + BranchInst *BI = static_cast(Term); + BI->setSuccessor(0, PHIBlock); + } + BasicBlock *OverallPhiBlock = PHIVBBIt->second; + // If this is the first function, we do not need to worry about mergiing + // this with any other block in the overall outlined function + if (FirstFunction) + continue; + + // For our PHINode, we find the combined canonical numbering, and + // attempt to find a matching PHINode in the overall PHIBlock. If we + // cannot, we copy the PHINode and move it into this new block. + DenseSet PNGVNs; + for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) { + Value *IVal = PN->getIncomingValue(Idx); + Optional GVN = Region.Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = Region.Candidate->getCanonicalNum(*GVN); + PNGVNs.insert(*CanonNum); + } + for (Instruction &Inst : *OverallPhiBlock) { + if (!isa(Inst)) + continue; + bool Found = true; + PHINode *CurrPN = static_cast(&Inst); + for (unsigned Idx = 0; Idx < CurrPN->getNumIncomingValues(); Idx++) { + Value *IVal = CurrPN->getIncomingValue(Idx); + OutlinableRegion *First = Group.Regions[0]; + Optional GVN = First->Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = First->Candidate->getCanonicalNum(*GVN); + if (!PNGVNs.contains(*CanonNum)) { + Found = false; + break; + } + } + if (!Found) { + PHINode *NewPN = static_cast(PN->clone()); + NewPN->insertBefore(&*OverallPhiBlock->begin()); + } + } } // If we added an edge for basic blocks without a predecessor, we remove it @@ -1058,24 +1249,23 @@ BasicBlock *CompBB = VToB.second; BasicBlock *OutputBB = OutputBBIt->second; - if (CompBB->size() - 1 != OutputBB->size()) { - WrongSize = true; + if (CompBB->size() - 1 != OutputBB->size()) { + WrongSize = true; break; - } + } - WrongSize = false; - BasicBlock::iterator NIt = OutputBB->begin(); - for 
(Instruction &I : *CompBB) { - if (isa(&I)) - continue; + BasicBlock::iterator NIt = OutputBB->begin(); + for (Instruction &I : *CompBB) { + if (isa(&I)) + continue; - if (!I.isIdenticalTo(&(*NIt))) { - WrongInst = true; - break; - } + if (!I.isIdenticalTo(&(*NIt))) { + WrongInst = true; + break; + } - NIt++; - } + NIt++; + } } if (!WrongInst && !WrongSize) @@ -1103,10 +1293,11 @@ OutlinableGroup &OG, OutlinableRegion &Region, DenseMap OutputBBs, DenseMap EndBBs, - const DenseMap &OutputMappings, + const DenseMap &OutputMappings, std::vector> &OutputStoreBBs) { DenseSet ValuesToFind(Region.GVNStores.begin(), Region.GVNStores.end()); + //errs() << "Num values to find " << ValuesToFind.size() << "\n"; // We iterate over the instructions in the extracted function, and find the // global value number of the instructions. If we find a value that should @@ -1119,6 +1310,13 @@ ExcludeBBs.insert(VBPair.second); for (std::pair &VBPair : OutputBBs) ExcludeBBs.insert(VBPair.second); + // Add PHIBlocks to exclusionary zones since it is possible they may not be + // same across the extracted functions. + for (std::pair &VBPair : Region.PHIBlocks) + ExcludeBBs.insert(VBPair.second); + for (std::pair &VBPair : Region.Parent->PHIBlocks) + ExcludeBBs.insert(VBPair.second); + std::vector ExtractedFunctionInsts = collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs); std::vector OverallFunctionInsts = @@ -1138,6 +1336,30 @@ // If we have found one of the stored values for output, replace the value // with the corresponding one from the overall function. if (GVN.hasValue() && ValuesToFind.erase(GVN.getValue())) { + Value *NewV = OverallFunctionInsts[Idx]; + PHINode *OverallPN = nullptr; + // If there is a PHINode following the instruction, it means it is not + // really an output, but the following PHINode is, we make sure to update + // the uses of the PHINode as well to ensure that our output blocks in the + // overall function are consistent. 
+ for (User *U : NewV->users()) { + if (PHINode *PN = dyn_cast(U)) { + OverallPN = PN; + } + } + + for (User *U : V->users()) { + if (PHINode *PN = dyn_cast(U)) { + if (!OverallPN) { + BasicBlock *Old = static_cast(V)->getParent(); + BasicBlock *New = static_cast(NewV)->getParent(); + PN->replaceIncomingBlockWith(Old, New); + continue; + } + PN->replaceAllUsesWith(OverallPN); + } + } + V->replaceAllUsesWith(OverallFunctionInsts[Idx]); if (ValuesToFind.size() == 0) break; @@ -1201,8 +1423,8 @@ NewBB = VtoBB.second; DenseMap::iterator VBBIt = EndBBs.find(RetValueForBB); - LLVM_DEBUG(dbgs() << "Create output block for region in" - << Region.ExtractedFunction << " to " + LLVM_DEBUG(dbgs() << "Create output block for region in" + << Region.ExtractedFunction << " to " << *NewBB); BranchInst::Create(VBBIt->second, NewBB); OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB)); @@ -1233,17 +1455,17 @@ M.getContext(), "final_block_" + Twine(static_cast(FinalBlockIdx++)), AggFunc); - Instruction *Term = EndBB->getTerminator(); - Term->moveBefore(*ReturnBlock, ReturnBlock->end()); + Instruction *Term = EndBB->getTerminator(); + Term->moveBefore(*ReturnBlock, ReturnBlock->end()); // Put the switch statement in the old end basic block for the function // with a fall through to the new return block. 
- LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " - << OutputStoreBBs.size() << "\n"); - SwitchInst *SwitchI = - SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), - ReturnBlock, OutputStoreBBs.size(), EndBB); + LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " + << OutputStoreBBs.size() << "\n"); + SwitchInst *SwitchI = + SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), + ReturnBlock, OutputStoreBBs.size(), EndBB); - unsigned Idx = 0; + unsigned Idx = 0; for (DenseMap OutputStoreBB : OutputStoreBBs) { DenseMap::iterator OSBBIt = OutputStoreBB.find(OutputBlock.first); @@ -1253,9 +1475,9 @@ BasicBlock *BB = OSBBIt->second; SwitchI->addCase( ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB); - Term = BB->getTerminator(); - Term->setSuccessor(0, ReturnBlock); - Idx++; + Term = BB->getTerminator(); + Term->setSuccessor(0, ReturnBlock); + Idx++; } } return; @@ -1273,10 +1495,10 @@ BasicBlock *EndBB = EndBBIt->second; BasicBlock *OutputBB = VBPair.second; Instruction *Term = OutputBB->getTerminator(); - Term->eraseFromParent(); - Term = EndBB->getTerminator(); + Term->eraseFromParent(); + Term = EndBB->getTerminator(); moveBBContents(*OutputBB, *EndBB); - Term->moveBefore(*EndBB, EndBB->end()); + Term->moveBefore(*EndBB, EndBB->end()); OutputBB->eraseFromParent(); } } @@ -1316,11 +1538,11 @@ DenseMap NewBBs; unsigned Idx = 0; for (std::pair &VtoBB : CurrentGroup.EndBBs) { - BasicBlock *NewBB = BasicBlock::Create( + BasicBlock *NewBB = BasicBlock::Create( M.getContext(), Twine("output_block_") + Twine(static_cast(0)) + Twine("_") + Twine(static_cast(Idx++)), - CurrentGroup.OutlinedFunction); + CurrentGroup.OutlinedFunction); NewBBs.insert(std::make_pair(VtoBB.first, NewBB)); } CurrentOS->OutputBlockNum = 0; @@ -1340,13 +1562,12 @@ RetValueForBB = VtoBB.first; NewBB = VtoBB.second; - if (NewBB->size() == 0) { - CurrentOS->OutputBlockNum = -1; - NewBB->eraseFromParent(); + if (NewBB->size() 
== 0) { + NewBB->eraseFromParent(); ToRemove.push_back(RetValueForBB); continue; - } - + } + AllRemoved = false; DenseMap::iterator VBBIt = CurrentGroup.EndBBs.find(RetValueForBB); @@ -1387,11 +1608,11 @@ DenseMap NewBBs; unsigned BBIdx = 0; for (std::pair &VtoBB : CurrentGroup.EndBBs) { - BasicBlock *NewBB = BasicBlock::Create( + BasicBlock *NewBB = BasicBlock::Create( M.getContext(), Twine("output_block_") + Twine(static_cast(Idx)) + Twine("_") + Twine(static_cast(BBIdx++)), - CurrentGroup.OutlinedFunction); + CurrentGroup.OutlinedFunction); NewBBs.insert(std::make_pair(VtoBB.first, NewBB)); } replaceArgumentUses(*CurrentOS, NewBBs); @@ -1419,19 +1640,21 @@ return false; } - if (Region.Candidate->end()->Inst != - Region.Candidate->backInstruction()->getNextNonDebugInstruction()) { - IRInstructionDataList *IDL = Region.Candidate->front()->IDL; - Instruction *NewEndInst = - Region.Candidate->backInstruction()->getNextNonDebugInstruction(); - IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) - IRInstructionData(*NewEndInst, - InstructionClassifier.visit(*NewEndInst), *IDL); - - // Insert the first IRInstructionData of the new region after the - // last IRInstructionData of the IRSimilarityCandidate. - IDL->insert(Region.Candidate->end(), *NewEndIRID); - } + if (!Region.Candidate->backInstruction()->isTerminator()) { + if (Region.Candidate->end()->Inst != + Region.Candidate->backInstruction()->getNextNonDebugInstruction()) { + IRInstructionDataList *IDL = Region.Candidate->front()->IDL; + Instruction *NewEndInst = + Region.Candidate->backInstruction()->getNextNonDebugInstruction(); + IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) + IRInstructionData(*NewEndInst, + InstructionClassifier.visit(*NewEndInst), *IDL); + + // Insert the first IRInstructionData of the new region after the + // last IRInstructionData of the IRSimilarityCandidate. 
+ IDL->insert(Region.Candidate->end(), *NewEndIRID); + } + } bool BadInst = any_of(*IRSC, [this](IRInstructionData &ID) { // We check if there is a discrepancy between the InstructionDataList @@ -1441,9 +1664,10 @@ // Since we do not have any similarity data about this particular // instruction, we cannot confidently outline it, and must discard this // candidate. - if (std::next(ID.getIterator())->Inst != - ID.Inst->getNextNonDebugInstruction()) - return true; + if (!ID.Inst->isTerminator()) + if (std::next(ID.getIterator())->Inst != + ID.Inst->getNextNonDebugInstruction()) + return true; return !this->InstructionClassifier.visit(ID.Inst); }); @@ -1476,9 +1700,14 @@ if (PreviouslyOutlined) continue; - // TODO: If in the future we can outline across BasicBlocks, we will need to - // check all BasicBlocks contained in the region. - if (IRSC.getStartBB()->hasAddressTaken()) + bool BBHasAddressTaken = false; + for (IRInstructionData &ID : IRSC) { + if (ID.Inst->getParent()->hasAddressTaken()) { + BBHasAddressTaken = true; + break; + } + } + if (BBHasAddressTaken) continue; if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() && @@ -1498,9 +1727,10 @@ // Since we do not have any similarity data about this particular // instruction, we cannot confidently outline it, and must discard this // candidate. 
-      if (std::next(ID.getIterator())->Inst !=
-          ID.Inst->getNextNonDebugInstruction())
-        return true;
+      if (!ID.Inst->isTerminator())
+        if (std::next(ID.getIterator())->Inst !=
+            ID.Inst->getNextNonDebugInstruction())
+          return true;
       return !this->InstructionClassifier.visit(ID.Inst);
     });
 
@@ -1567,10 +1797,35 @@
                                        OutlinableGroup &CurrentGroup,
                                        TargetTransformInfo &TTI) {
   InstructionCost OutputCost = 0;
+  unsigned NumOutputBranches = 0;
+
+  IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
+  DenseSet<BasicBlock *> CandidateBlocks;
+  for (IRInstructionData &ID : Candidate) {
+    BasicBlock *BB = ID.Inst->getParent();
+    CandidateBlocks.insert(BB);
+  }
+
+  for (IRInstructionData &ID : Candidate) {
+    if (!isa<BranchInst>(ID.Inst))
+      continue;
+
+    for (Value *V : ID.OperVals) {
+      BasicBlock *BB = static_cast<BasicBlock *>(V);
+      DenseSet<BasicBlock *>::iterator CBIt = CandidateBlocks.find(BB);
+      if (CBIt != CandidateBlocks.end())
+        continue;
+      NumOutputBranches++;
+    }
+  }
+
+  if (NumOutputBranches == 0)
+    NumOutputBranches++;
+
+  CurrentGroup.BranchesToOutside = NumOutputBranches;
 
   for (const ArrayRef<unsigned> &OutputUse :
        CurrentGroup.OutputGVNCombinations) {
-    IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
     for (unsigned GVN : OutputUse) {
       Optional<Value *> OV = Candidate.fromGVN(GVN);
       assert(OV.hasValue() && "Could not find value for GVN?");
@@ -1585,14 +1840,14 @@
       LLVM_DEBUG(dbgs() << "Adding: " << StoreCost
                         << " instructions to cost for output of type "
                         << *V->getType() << "\n");
-      OutputCost += StoreCost;
+      OutputCost += StoreCost * NumOutputBranches;
     }
 
     InstructionCost BranchCost =
         TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize);
     LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for"
                       << " a branch instruction\n");
-    OutputCost += BranchCost;
+    OutputCost += BranchCost * NumOutputBranches;
   }
 
   // If there is more than one output scheme, we must have a comparison and
@@ -1611,7 +1866,7 @@
     LLVM_DEBUG(dbgs() << "Adding: " << TotalCost
                       << " instructions for each 
switch case for each different" << " output path in a function\n"); - OutputCost += TotalCost; + OutputCost += TotalCost * NumOutputBranches; } return OutputCost; @@ -1702,7 +1957,6 @@ Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!"); - assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!"); Function *OrigF = Region.StartBB->getParent(); CodeExtractorAnalysisCache CEAC(*OrigF); Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); @@ -1716,7 +1970,9 @@ return false; } - BasicBlock *RewrittenBB = Region.FollowBB->getSinglePredecessor(); + User *InstAsUser = Region.ExtractedFunction->user_back(); + BasicBlock *RewrittenBB = cast(InstAsUser)->getParent(); + Region.PrevBB = RewrittenBB->getSinglePredecessor(); Region.StartBB = RewrittenBB; Region.EndBB = RewrittenBB; @@ -1759,6 +2015,7 @@ unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. + InstructionClassifier.EnableBranches = EnableBranches; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -1795,7 +2052,7 @@ // each section in the set. 
    NotSame.clear();
    CurrentGroup->findSameConstants(NotSame);
-
+
    if (CurrentGroup->IgnoreGroup)
      continue;
 
@@ -1810,7 +2067,15 @@
       if (!OS->CandidateSplit)
         continue;
 
-      std::vector<BasicBlock *> BE = {OS->StartBB};
+      std::vector<BasicBlock *> BE;
+      DenseSet<BasicBlock *> BBSet;
+      for (IRInstructionData &ID : *OS->Candidate) {
+        BasicBlock *BB = ID.Inst->getParent();
+        if (BBSet.find(BB) != BBSet.end())
+          continue;
+        BE.push_back(BB);
+        BBSet.insert(BB);
+      }
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");
@@ -1867,12 +2132,11 @@
 
     ExtractorAllocator.DestroyAll();
 
-    if (NegativeCostGroups.size() > 1)
-      llvm::stable_sort(NegativeCostGroups,
-                        [](const OutlinableGroup *LHS,
-                           const OutlinableGroup *RHS) {
-                          return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost;
-                        });
+    if (NegativeCostGroups.size() > 1 && CostModel)
+      llvm::stable_sort(NegativeCostGroups, [](const OutlinableGroup *LHS,
+                                               const OutlinableGroup *RHS) {
+        return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost;
+      });
 
   std::vector<Function *> FuncsToRemove;
   for (OutlinableGroup *CG : NegativeCostGroups) {
@@ -1901,7 +2165,15 @@
     // Create functions out of all the sections, and mark them as outlined.
 
OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { - std::vector BE = {OS->StartBB}; + std::vector BE; + DenseSet BBSet; + for (IRInstructionData &ID : *OS->Candidate) { + BasicBlock *BB = ID.Inst->getParent(); + if (BBSet.find(BB) != BBSet.end()) + continue; + BE.push_back(BB); + BBSet.insert(BB); + } OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); Index: llvm/test/Transforms/IROutliner/outlining-across-branch.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-across-branch.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline exactly the same branch structure +; while also outlining similar items on either side of the branch. 
+ +define void @outline_outputs1() #0 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + br label %next +next: + store i32 2, i32* %output, align 4 + store i32 3, i32* %result, align 4 + ret void +} + +define void @outline_outputs2() #0 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + br label %next +next: + store i32 2, i32* %output, align 4 + store i32 3, i32* %result, align 4 + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_after_outline.exitStub: +; CHECK-NEXT: ret void +; 
CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: store i32 3, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: store i32 2, i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 3, i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]] +; Index: llvm/test/Transforms/IROutliner/outlining-basic-branches.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-basic-branches.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline exactly the same structure without +; any other items to outline. + +define void @outline_outputs1() #0 { +entry: + br label %next +next: + br label %next2 +next2: + br label %next +next3: + %a = alloca i32, align 4 + br label %next4 +next4: + br label %next3 +next5: + br label %next6 +next6: + %b = alloca i32, align 4 + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_0() +; CHECK-NEXT: br label [[NEXT]] +; CHECK: next3: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0() +; CHECK-NEXT: br label [[NEXT3:%.*]] +; CHECK: next5: +; CHECK-NEXT: br label [[NEXT6:%.*]] +; CHECK: next6: +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[NEXT_TO_OUTLINE:%.*]] +; CHECK: next.exitStub: +; CHECK-NEXT: ret void +; CHECK: next_to_outline: +; CHECK-NEXT: br label [[NEXT2:%.*]] +; CHECK: next2: +; CHECK-NEXT: br label [[NEXT_EXITSTUB:%.*]] +; Index: 
llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + 
br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[DIFF_CE:%.*]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 
+; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT]] +; CHECK: block_6.split: +; CHECK-NEXT: [[DIFF_CE]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ] +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; Index: llvm/test/Transforms/IROutliner/outlining-conditional-match.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-conditional-match.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline branches that use a conditional branch +; rather than just a implicit branch. + +define void @outline_outputs1() #0 { +entry: + %0 = icmp eq i32 2, 0 + br i1 %0, label %next, label %next2 +next: + br label %next3 +next2: + br label %next +next3: + br label %next4 +next4: + %1 = icmp eq i32 2, 0 + br i1 %0, label %next5, label %next6 +next5: + br label %next7 +next6: + br label %next5 +next7: + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 2, 0 +; CHECK-NEXT: call void @outlined_ir_func_0(i1 [[TMP0]]) +; CHECK-NEXT: br label [[NEXT3:%.*]] +; CHECK: next3: +; CHECK-NEXT: br label [[NEXT4:%.*]] +; CHECK: next4: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 2, 0 +; CHECK-NEXT: call void @outlined_ir_func_0(i1 [[TMP0]]) +; CHECK-NEXT: br label [[NEXT7:%.*]] +; CHECK: next7: +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: next3.exitStub: +; CHECK-NEXT: ret void +; CHECK: entry_to_outline: +; CHECK-NEXT: br i1 [[TMP0:%.*]], label [[NEXT:%.*]], label [[NEXT2:%.*]] +; CHECK: next: +; CHECK-NEXT: br label [[NEXT3_EXITSTUB:%.*]] +; CHECK: next2: +; CHECK-NEXT: br 
label [[NEXT]] +; Index: llvm/test/Transforms/IROutliner/outlining-multiple-exits-diff-outputs.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-multiple-exits-diff-outputs.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, different outputs are +; needed, this checks that they are handled by separate switch statements. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_7 +block_6: + %div = udiv i32 %aval, %bval + ret void +block_7: + %sub = sub i32 %a2val, %b2val + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + 
%add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_7 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = sub i32 %a2val, %b2val + ret void +block_7: + %quot = udiv i32 %add, %mul + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* 
[[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]], i32 0) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TMP0]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast 
i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[MUL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[ADD_LOC]], i32* [[MUL_LOC]], i32 1) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[MUL_RELOAD:%.*]] = load i32, i32* [[MUL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TMP0]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[QUOT:%.*]] = udiv i32 [[ADD_RELOAD]], [[MUL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal i1 @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: switch i32 [[TMP8:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: block_7.exitStub: +; CHECK-NEXT: switch i32 [[TMP8]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; 
CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_7_EXITSTUB:%.*]] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[A2VAL]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: store i32 [[B2VAL]], i32* [[TMP5:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[AVAL]], i32* [[TMP6:%.*]], align 4 +; CHECK-NEXT: store i32 [[BVAL]], i32* [[TMP7:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[A2VAL]], i32* [[TMP4]], align 4 +; CHECK-NEXT: store i32 [[B2VAL]], i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP7]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; Index: llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll 
=================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_7 +block_6: + %div = udiv i32 %aval, %bval + ret void +block_7: + %sub = sub i32 %a2val, %b2val + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = 
load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_7 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = sub i32 %a2val, %b2val + ret void +block_7: + %quot = udiv i32 %aval, %bval + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], 
i32* [[BVAL_LOC]]) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* 
[[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]]) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[QUOT:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal i1 @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[AVAL:%.*]], i32* [[TMP6:%.*]], align 4 +; CHECK-NEXT: store i32 [[BVAL:%.*]], i32* [[TMP7:%.*]], align 4 +; CHECK-NEXT: ret i1 true +; CHECK: block_7.exitStub: +; CHECK-NEXT: store i32 [[A2VAL:%.*]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: store i32 [[B2VAL:%.*]], i32* [[TMP5:%.*]], align 4 +; CHECK-NEXT: ret i1 false +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL]] = load i32, i32* 
[[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_7_EXITSTUB:%.*]] +; Index: llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp =================================================================== --- llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -726,22 +726,8 @@ ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); } -// In most cases, the illegal instructions we are collecting don't require any -// sort of setup. In these cases, we can just only have illegal instructions, -// and the mapper will create 0 length vectors, and we can check that. - -// In cases where we have legal instructions needed to set up the illegal -// instruction, to check illegal instructions are assigned unsigned integers -// from the maximum value decreasing to 0, it will be greater than a legal -// instruction that comes after. So to check that we have an illegal -// instruction, we place a legal instruction after an illegal instruction, and -// check that the illegal unsigned integer is greater than the unsigned integer -// of the legal instruction. 
- -// Checks that the branch is mapped to be illegal since there is extra checking -// needed to ensure that a branch in one region is branching to an isomorphic -// location in a different region. -TEST(IRInstructionMapper, BranchIllegal) { +// Checks that the branch is mapped to legal when the option is set. +TEST(IRInstructionMapper, BranchLegal) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { bb0: @@ -759,12 +745,28 @@ SpecificBumpPtrAllocator InstDataAllocator; SpecificBumpPtrAllocator IDLAllocator; IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); getVectors(*M, Mapper, InstrList, UnsignedVec); ASSERT_EQ(InstrList.size(), UnsignedVec.size()); - ASSERT_EQ(UnsignedVec.size(), static_cast(0)); + ASSERT_EQ(UnsignedVec.size(), static_cast(3)); + ASSERT_TRUE(UnsignedVec[1] > UnsignedVec[0]); + ASSERT_TRUE(UnsignedVec[1] < UnsignedVec[2]); } +// In most cases, the illegal instructions we are collecting don't require any +// sort of setup. In these cases, we can just only have illegal instructions, +// and the mapper will create 0 length vectors, and we can check that. + +// In cases where we have legal instructions needed to set up the illegal +// instruction, to check illegal instructions are assigned unsigned integers +// from the maximum value decreasing to 0, it will be greater than a legal +// instruction that comes after. So to check that we have an illegal +// instruction, we place a legal instruction after an illegal instruction, and +// check that the illegal unsigned integer is greater than the unsigned integer +// of the legal instruction. + // Checks that a PHINode is mapped to be illegal since there is extra checking // needed to ensure that a branch in one region is bin an isomorphic // location in a different region. 
@@ -1445,8 +1447,8 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a memcpy instruction is mapped to an illegal value.
@@ -1476,8 +1478,9 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] > UnsignedVec[3]);
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a memmove instruction is mapped to an illegal value.
@@ -1507,8 +1511,8 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a variable argument instructions are mapped to an illegal value.
@@ -1555,11 +1559,10 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(16));
-  ASSERT_TRUE(UnsignedVec[4] < UnsignedVec[3]);
-  ASSERT_TRUE(UnsignedVec[7] < UnsignedVec[6]);
-  ASSERT_TRUE(UnsignedVec[10] < UnsignedVec[9]);
-  ASSERT_TRUE(UnsignedVec[13] < UnsignedVec[12]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(17));
+  ASSERT_TRUE(UnsignedVec[7] < UnsignedVec[0]);
+  ASSERT_TRUE(UnsignedVec[13] < UnsignedVec[10]);
+  ASSERT_TRUE(UnsignedVec[16] < UnsignedVec[13]);
 }
 
 // Check the length of adding two illegal instructions one after th other.
We
@@ -1599,21 +1602,25 @@
 // two blocks of two legal instructions and terminator, and checks them for
 // instruction similarity.
 static bool longSimCandCompare(std::vector<IRInstructionData *> &InstrList,
-                               bool Structure = false) {
+                               bool Structure = false,
+                               unsigned Length = 2,
+                               unsigned StartIdxOne = 0,
+                               unsigned StartIdxTwo = 3) {
   std::vector<IRInstructionData *>::iterator Start, End;
 
   Start = InstrList.begin();
   End = InstrList.begin();
 
-  std::advance(End, 1);
-  IRSimilarityCandidate Cand1(0, 2, *Start, *End);
+  std::advance(Start, StartIdxOne);
+  std::advance(End, StartIdxOne + Length - 1);
+  IRSimilarityCandidate Cand1(StartIdxOne, Length, *Start, *End);
 
   Start = InstrList.begin();
   End = InstrList.begin();
 
-  std::advance(Start, 3);
-  std::advance(End, 4);
-  IRSimilarityCandidate Cand2(3, 2, *Start, *End);
+  std::advance(Start, StartIdxTwo);
+  std::advance(End, StartIdxTwo + Length - 1);
+  IRSimilarityCandidate Cand2(StartIdxTwo, Length, *Start, *End);
   if (Structure)
     return IRSimilarityCandidate::compareStructure(Cand1, Cand2);
   return IRSimilarityCandidate::isSimilar(Cand1, Cand2);
@@ -2083,6 +2089,206 @@
   ASSERT_TRUE(longSimCandCompare(InstrList, true));
 }
 
+// Checks that the same structure is recognized between two candidates when
+// the branches target other blocks inside the same region; the relative
+// distance between the blocks must be the same.
+TEST(IRSimilarityCandidate, SameBranchStructureInternal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(12)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_TRUE(longSimCandCompare(InstrList, true, 5, 0, 6)); +} + +// Checks that the different structure is recognized between two candidates, +// when the branches target other blocks inside the same region, the relative +// distance between the blocks must be the same. 
+TEST(IRSimilarityCandidate, DifferentBranchStructureInternal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb2 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(18)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_FALSE(longSimCandCompare(InstrList, true, 6, 0, 9)); +} + +// Checks that the same structure is recognized between two candidates, when +// the branches target other blocks outside region, the relative distance +// does not need to be the same. 
+TEST(IRSimilarityCandidate, SameBranchStructureOutside) {
+  StringRef ModuleString = R"(
+  define i32 @f(i32 %a, i32 %b) {
+  bb0:
+     %0 = add i32 %a, %b
+     %1 = add i32 %b, %a
+     br label %bb1
+  bb1:
+     %2 = add i32 %b, %a
+     %3 = add i32 %a, %b
+     ret i32 0
+  }
+
+  define i32 @f2(i32 %a, i32 %b) {
+  bb0:
+     %0 = add i32 %a, %b
+     %1 = add i32 %b, %a
+     br label %bb1
+  bb1:
+     %2 = add i32 %b, %a
+     %3 = add i32 %a, %b
+     ret i32 0
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
+  SpecificBumpPtrAllocator<IRInstructionDataList> IDLAllocator;
+  IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator);
+  Mapper.InstClassifier.EnableBranches = true;
+  Mapper.initializeForBBs(*M);
+  getVectors(*M, Mapper, InstrList, UnsignedVec);
+
+  // Check to make sure that we have a long enough region.
+  ASSERT_EQ(InstrList.size(), static_cast<unsigned>(12));
+  // Check that the instructions were added correctly to both vectors.
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+
+  ASSERT_TRUE(longSimCandCompare(InstrList, true, 3, 0, 6));
+}
+
+// Checks that the same structure is still recognized between two candidates
+// even when the branches target different blocks outside the region; the
+// relative distance to blocks outside the region does not need to be the same.
+TEST(IRSimilarityCandidate, DifferentBranchStructureOutside) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb2 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(15)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_TRUE(longSimCandCompare(InstrList, true, 3, 0, 6)); +} + // Checks that two sets of identical instructions are found to be the same. // Both sequences of adds have the same operand ordering, and the same // instructions, making them strcturally equivalent.