Index: llvm/include/llvm/Transforms/IPO/IROutliner.h =================================================================== --- llvm/include/llvm/Transforms/IPO/IROutliner.h +++ llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -95,6 +95,10 @@ /// required for the following basic blocks in this case. bool EndsInBranch = false; + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap PHIBlocks; + /// Mapping of the argument number in the deduplicated function /// to a given constant, which is used when creating the arguments to the call /// to the newly created deduplicated function. This is handled separately @@ -182,7 +186,14 @@ IROutliner(function_ref GTTI, function_ref GIRSI, function_ref GORE) - : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {} + : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) { + + // Check that the DenseMap implementation has not changed. + assert(DenseMapInfo::getEmptyKey() == (unsigned)-1 && + "DenseMapInfo's empty key isn't -1!"); + assert(DenseMapInfo::getTombstoneKey() == (unsigned)-2 && + "DenseMapInfo's tombstone key isn't -2!"); + } bool run(Module &M); private: Index: llvm/lib/Transforms/IPO/IROutliner.cpp =================================================================== --- llvm/lib/Transforms/IPO/IROutliner.cpp +++ llvm/lib/Transforms/IPO/IROutliner.cpp @@ -104,6 +104,16 @@ /// of the region. unsigned BranchesToOutside = 0; + /// Tracker counting backwards from the highest unsigned value possible to + /// avoid conflicting with the GVNs of assigned values. We start at -3 since + /// -2 and -1 are assigned by the DenseMap. + unsigned PHINodeGVNTracker = -3; + + DenseMap, SmallVector>> + PHINodeGVNToGVNs; + DenseMap GVNsToPHINodeGVN; + /// The number of instructions that will be outlined by extracting \ref /// Regions. InstructionCost Benefit = 0; @@ -358,6 +368,22 @@ return Benefit; } +/// Check the \p OutputMappings structure for value \p Input, if it exists +/// it has been used as an output for outlining, and has been renamed, and we +/// return the new value, otherwise, we return the same value. +/// +/// \param OutputMappings [in] - The mapping of values to their renamed value +/// after being used as an output for an outlined region. +/// \param Input [in] - The value to find the remapped value of, if it exists. +/// \return The remapped value if it has been renamed, and the same value if has +/// not. +static Value *findOutputMapping(const DenseMap OutputMappings, + Value *Input) { + if (OutputMappings.find(Input) != OutputMappings.end()) + return OutputMappings.find(Input)->second; + return Input; +} + /// Find whether \p Region matches the global value numbering to Constant /// mapping found so far. /// @@ -838,6 +864,208 @@ Region.NumExtractedInputs = OriginalIndex; } +/// We check if the \p V has any uses outside of the region other than \p PN. +/// If it does, we return true. If it does not, we return false. +/// +/// \param V [in] - The value to check. +/// \param PHILoc [in] - The location in the PHINode of \p V. +/// \param PN [in] - The PHINode using \p V. +/// \param Exits [in] - The potential blocks we exit to from the outlined +/// region. +/// \param BBSet [in] - The basic blocks contained in the region. +/// \return whether there are non phi node uses for this value outside of the +/// region. +static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN, + SmallPtrSet &Exits, + DenseSet &BBSet) { + // We check to see if the value is used by the PHINode from some other + // predecessor not included in the region. If it is, we make sure + // to keep it as an output. + for (unsigned Idx = 0, Edx = PN.getNumIncomingValues(); Idx < Edx; Idx++) { + if (Idx == PHILoc) + continue; + if (V != PN.getIncomingValue(Idx)) + continue; + if (BBSet.contains(PN.getIncomingBlock(Idx))) + continue; + return true; + } + + // We examine the other uses of the value to see if it used outside the + // region. + for (User *U : V->users()) { + Instruction *I = dyn_cast(U); + if (!I) + continue; + + // If the use of the item is inside the region, we skip it since use + // inside the region doesn't tell us anything about the uses outside + // non exit blocks outside the region. + BasicBlock *Parent = I->getParent(); + if (BBSet.contains(Parent)) + continue; + + // If it's not a PHINode then we definitely know the use matters. + if (!isa(I)) + return true; + + // If we have a PHINode outside one of the exit locations, then it + // can be considered an outside use as well. If there is a PHINode + // contained in the Exit where this values use matters, it will be + // caught when we analyze that PHINode. + if (!Exits.contains(Parent)) + return true; + } + + return false; +} + +/// We check the exit block, and test whether it has any PHINodes that should +/// be considered outputs as it combines a value used as an output in the +/// region. +/// +/// \param ExitBB [in] - The block to analyze. +/// \param ExitBBs [in] - The potential exit blocks from the region. +/// \param BBSet [in] - The basic blocks in the region. +/// \param Outputs [in, out] - The existing outputs for the region, we may add +/// PHINodes to this as we find that they replace output values. +/// \param PHIWrapped [out] - A set containing outputs that are totally replaced +/// by a PHINode. +/// \param NotPHIWrapped [out] - A set containing outputs that are used in +/// PHINodes, but have other uses, and should still be considered outputs. +static void analyzeExitPHIsForOutputUses(BasicBlock *ExitBB, + SmallPtrSet &Exits, + DenseSet &BBSet, + SetVector &Outputs, + DenseSet &PHIWrapped, + DenseSet &NotPHIWrapped) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. + SmallVector IncomingVals; + for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I) + if (BBSet.contains(PN.getIncomingBlock(I))) + IncomingVals.push_back(I); + + // Do not process PHI if there are no predecessors from region. + unsigned NumIncomingVals = IncomingVals.size(); + if (NumIncomingVals == 0) + continue; + + // If there is one predecessor, we mark it as a value that needs to be kept + // as an output. + if (NumIncomingVals == 1) { + Value *V = PN.getIncomingValue(*IncomingVals.begin()); + NotPHIWrapped.insert(V); + PHIWrapped.erase(V); + continue; + } + + // This PHINode will be used as an output value, so we add it to our list. + Outputs.insert(&PN); + + // Not all of the incoming values should be ignored as other inputs and + // outputs may have uses in outlined region. If they have other uses + // outside of the single PHINode we should not skip over it. + for (unsigned Idx : IncomingVals) { + Value *V = PN.getIncomingValue(Idx); + if (outputHasNonPHI(V, Idx, PN, Exits, BBSet)) { + NotPHIWrapped.insert(V); + PHIWrapped.erase(V); + continue; + } + if (!NotPHIWrapped.contains(V)) + PHIWrapped.insert(V); + } + } +} + +// Represents the type for the unsigned number denoting the output number for +// phi node, along with the canonical number for the exit block. +using ArgLocWithBBCanon = std::pair; +// The list of canonical numbers for the incoming values to a PHINode. +using CanonList = SmallVector; +// The pair type representing the set of canonical values being combined in the +// PHINode, along with the location data for the PHINode. +using PHINodeData = std::pair; + +/// Encode \p PND as an integer for easy lookup based on the argument location, +/// the parent BasicBlock canonical numbering, and the canonical numbering of +/// the values stored in the PHINode. +/// +/// \param PND - The data to hash. +/// \returns The hash code of \p PND. +static hash_code encodePHINodeData(PHINodeData &PND) { + return llvm::hash_combine( + llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second), + llvm::hash_combine_range(PND.second.begin(), PND.second.end())); +} + +/// We need to create a special GVN for PHINodes that will be used outside of +/// the region. We create a hash code based on the Canonical number of the +/// parent BasicBlock, the canonical numbering of the values stored in the +/// PHINode and the aggregate argument location. This is used to find whether +/// this PHINode type has been given a canonical numbering already. If not, we +/// assign it a value and store it for later use. The value is returned to +/// identify different output schemes for the set of regions. +/// +/// \param Region - The region that \p PN is an output for. +/// \param BBSet - The BasicBlocks containsed in the region. +/// \param PN - The PHINode we are analyzing. +/// \param AggArgIdx - The argument \p PN will be stored into. +/// \returns An optional holding the assigned canonical number, or None if +/// there is some attribute of the PHINode blocking it from being used. +static Optional getGVNForPHINode(OutlinableRegion &Region, + DenseSet BBSet, + PHINode *PN, unsigned AggArgIdx) { + OutlinableGroup &Group = *Region.Parent; + IRSimilarityCandidate &C = *Region.Candidate; + BasicBlock *PHIBB = PN->getParent(); + CanonList PHIGVNs; + for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; ++Idx) { + Value *V = PN->getIncomingValue(Idx); + // If we cannot find a GVN, this means that the input to the PHINode is + // not included in the region we are trying to analyze, meaning, that if + // it was outlined, we would be adding an extra input. We ignore this + // case for now, and so ignore the region. + if (!C.getGVN(V).hasValue()) { + Region.IgnoreRegion = true; + return None; + } + + // Collect the canonical numbers of the values in the PHINode. + unsigned GVN = C.getGVN(V).getValue(); + GVN = *C.getCanonicalNum(GVN); + PHIGVNs.push_back(GVN); + } + DenseMap::iterator GVNToPHIIt; + DenseMap::iterator PHIToGVNIt; + Optional BBGVN = C.getGVN(PHIBB); + assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!"); + + BBGVN = C.getCanonicalNum(BBGVN.getValue()); + assert(BBGVN.hasValue() && + "Could not find canonical number for the incoming block!"); + // Create a pair of the exit block canonical value, and the aggregate + // argument location, connected to the canonical numbers stored in the + // PHINode. + PHINodeData TemporaryPair = + std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs); + hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair); + + // Look for and create a new entry in our connection between canonical + // numbers for PHINodes, and the set of objects we just created. + GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash); + if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) { + bool Inserted = false; + std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert( + std::make_pair(Group.PHINodeGVNTracker, TemporaryPair)); + std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert( + std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--)); + } + + return GVNToPHIIt->second; +} + /// Create a mapping of the output arguments for the \p Region to the output /// arguments of the overall outlined function. /// @@ -863,22 +1091,11 @@ // After determining which blocks exit to PHINodes, we add these PHINodes to // the set of outputs to be processed. We also check the incoming values of // the PHINodes for whether they should no longer be considered outputs. - for (BasicBlock *ExitBB : Exits) { - for (PHINode &PN : ExitBB->phis()) { - // Find all incoming values from the outlining region. - SmallVector IncomingVals; - for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) - if (BBSet.contains(PN.getIncomingBlock(Idx))) - IncomingVals.push_back(Idx); - - // Do not process PHI if there is one (or fewer) predecessor from region. - if (IncomingVals.size() <= 1) - continue; - - Region.IgnoreRegion = true; - return; - } - } + DenseSet PHIWrapped; + DenseSet NotPHIWrapped; + for (BasicBlock *ExitBB : Exits) + analyzeExitPHIsForOutputUses(ExitBB, Exits, BBSet, Outputs, PHIWrapped, + NotPHIWrapped); // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -901,9 +1118,13 @@ // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (PHIWrapped.contains(Output)) + continue; + + unsigned AggArgIdx = 0; for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -915,7 +1136,7 @@ AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + AggArgIdx = Jdx; break; } @@ -929,13 +1150,40 @@ std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); Region.AggArgToExtracted.insert( std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + AggArgIdx = Group.ArgumentTypes.size() - 1; } - stable_sort(Region.GVNStores); + // TODO: Adapt to the extra input from the PHINode. + PHINode *PN = dyn_cast(Output); + + Optional GVN; + if (PN && !BBSet.contains(PN->getParent())) { + // Since values outside the region can be combined into PHINode when we + // have multiple exists, we collect both of these into a list to identify + // which values are being used in the PHINode. Each list identifies a + // different PHINode, and a different output, so we store it as it's own + // canonical value. These canonical values are also dependent on the + // output argument it is saved to. + + // If a PHINode as the same canonical values, but a different aggregate + // argument location, they will have distinct Canonical Values, to be used + // for the store sets. + GVN = getGVNForPHINode(Region, BBSet, PN, AggArgIdx); + if (!GVN.hasValue()) + return; + } else { + GVN = C.getGVN(Output); + GVN = C.getCanonicalNum(*GVN); + } + Region.GVNStores.push_back(*GVN); + OriginalIndex++; TypeIndex++; } + + // We sort the stored values to make sure that we are not affected by analysis + // order when determining what combination of items were stored. + stable_sort(Region.GVNStores); } void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, @@ -1071,6 +1319,196 @@ return Call; } +/// Attempt to find a block containing PHIBlocks for the given \p RetVal, and +/// create one to add to the overall function if it does not exist. +/// +/// \param Group - The OutlinableGroup containing the information about the +/// overall outlined function. +/// \param RetVal - The return value or exit option that we are currently +/// evaluating. +/// \returns The found or newly created BasicBlock to contain the needed +/// PHINodes to be used as outputs. +static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) { + DenseMap::iterator PHIVBBIt, RetBBIt; + PHIVBBIt = Group.PHIBlocks.find(RetVal); + RetBBIt = Group.EndBBs.find(RetVal); + assert(RetBBIt != Group.EndBBs.end() && "Could not find output value!"); + BasicBlock *ReturnBB = RetBBIt->second; + + // Find if a PHIBlock exists for this return value already. If it is + // the first time we are anaylzing this, we will not, so we record it. + PHIVBBIt = Group.PHIBlocks.find(RetVal); + if (PHIVBBIt != Group.PHIBlocks.end()) + return PHIVBBIt->second; + + // If we did not find a block, we create one, and insert it into the + // overall function and record it. + BasicBlock *PHIBlock; + bool Inserted = false; + PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block", + ReturnBB->getParent()); + std::tie(PHIVBBIt, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + + // We find the predecessors in the overall function, and make sure + // we now branch to this new block. + SmallVector BranchesToChange; + for (BasicBlock *Pred : predecessors(ReturnBB)) { + Instruction *Term = Pred->getTerminator(); + BranchInst *BI = static_cast(Term); + BranchesToChange.push_back(BI); + } + for (BranchInst *BI : BranchesToChange) { + for (unsigned Succ = 0; Succ < BI->getNumSuccessors(); Succ++) { + if (BI->getSuccessor(Succ) != ReturnBB) + continue; + BI->setSuccessor(Succ, PHIBlock); + } + } + + BranchInst::Create(ReturnBB, PHIBlock); + + return PHIVBBIt->second; +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region to get the passed Argument to. +/// \param AdjustArgNo - The boolean value, if true, we must adjust the +/// argument location as we are not analyzing the overall function call, but +/// extracted functionn call. If false, we do not have to make this adjustment. +/// \returns The passed value to the function now representing the region for +/// the argument \p A. +static Value *getPassedArgument(Argument *A, OutlinableRegion &Region, + bool AdjustArgNo) { + CallInst *CI = Region.Call; + unsigned ArgNum = A->getArgNo(); + + // If we don't need to adjust the argument number at all (since the call + // has already been replaced by a call to the overall outlined function) + // we can just get the specified argument. + if (!AdjustArgNo) + return CI->getArgOperand(ArgNum); + + // If it is a constant, we can look at our mapping from when we created + // the outputs to figure out what the constant value is. + if (Region.AggArgToConstant.count(ArgNum)) + return Region.AggArgToConstant.find(ArgNum)->second; + + // If it is not a constant, and we are not looking at the overall function, we + // need to adjust which argument we are looking at. + ArgNum = Region.AggArgToExtracted.find(ArgNum)->second; + return CI->getArgOperand(ArgNum); +} + +/// Find the canonical numbering for the incoming Values into the PHINode \p PN. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \param CanonNums [out] - The canonical numbering for the incoming values to +/// \p PN. +/// \param AdjustArgNo - A flag to use the extracted function call of \p Region +/// rather than the overall function's call. +static void findCanonNumsForPHI( + PHINode * PN, OutlinableRegion & Region, + const DenseMap &OutputMappings, + DenseSet &CanonNums, bool AdjustArgNo = false) { + // Iterate over the incoming values. + for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) { + Value *IVal = PN->getIncomingValue(Idx); + // If we have an argument as incoming value, we need to grab the passed + // value from the call itself. + if (Argument *A = dyn_cast(IVal)) + IVal = getPassedArgument(A, Region, AdjustArgno); + + // Get the original value if it has been replaced by an output value. + IVal = findOutputMapping(OutputMappings, IVal); + + // Find and add the canonical number for the incoming value. + Optional GVN = Region.Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = Region.Candidate->getCanonicalNum(*GVN); + CanonNums.insert(*CanonNum); + } +} + +/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock +/// in order to condense the number of instructions added to the outlined +/// function. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find +/// \p PN in. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \return the newly found or created PHINode in \p OverallPhiBlock. +static PHINode* +findOrCreatePHIInBlock(PHINode *PN, OutlinableRegion &Region, + BasicBlock *OverallPhiBlock, + const DenseMap &OutputMappings) { + OutlinableGroup &Group = *Region.Parent; + + DenseSet PNCanonNums; + // We have to use the extracted function since we have merged this region into + // the overall function yet. We make sure to reassign the argument numbering + // since it is possible that + findCanonNumsForPHI(PN, Region, OutputMappings, PNCanonNums, true); + + OutlinableRegion *First = Group.Regions[0]; + DenseSet CurrentCanonNums; + // Find the Canonical Numbering for each PHINode, if it matches, we replace + // the uses of the PHINode we are searching for, with the found PHINode. + for (PHINode &CurrPN : OverallPhiBlock->phis()) { + CurrentCanonNums.clear(); + findCanonNumsForPHI(&CurrPN, *First, OutputMappings, CurrentCanonNums); + + if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) { + return CurrentCanonNums.contains(CanonNum); + })) + return &CurrPN; + } + + // If we've made it here, it means we weren't able to replace the PHINode, so + // we must insert it ourselves. + PHINode *NewPN = static_cast(PN->clone()); + NewPN->insertBefore(&*OverallPhiBlock->begin()); + for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx; + Idx++) { + Value *IVal = NewPN->getIncomingValue(Idx); + BasicBlock *IBlock = NewPN->getIncomingBlock(Idx); + + // Find corresponding basic block in the overall function for the incoming + // block. + Instruction *Inst = IBlock->getFirstNonPHI(); + assert(Inst && "Incoming block is empty?"); + Value * TempVal = Region.findCorrespondingValueIn(*First, Inst); + assert(TempVal && "Value is nullptr?"); + BasicBlock *BlockToUse = cast(TempVal)->getParent(); + NewPN->setIncomingBlock(Idx, BlockToUse); + + // If we have an argument we make sure we replace using the argument from + // the correct function. + if (Argument *A = dyn_cast(IVal)) { + unsigned ArgNum = A->getArgNo(); + Value *Val = Group.OutlinedFunction->getArg(ArgNum); + NewPN->setIncomingValue(Idx, Val); + continue; + } + + // Find the corresponding value in the overall function. + IVal = findOutputMapping(OutputMappings, IVal); + Value *Val = Region.findCorrespondingValueIn(*First, IVal); + assert(Val && "Value is nullptr?"); + NewPN->setIncomingValue(Idx, Val); + } + return NewPN; +} + // Within an extracted function, replace the argument uses of the extracted // region with the arguments of the function for an OutlinableGroup. // @@ -1083,6 +1521,7 @@ static void replaceArgumentUses(OutlinableRegion &Region, DenseMap &OutputBBs, + const DenseMap &OutputMappings, bool FirstFunction = false) { OutlinableGroup &Group = *Region.Parent; assert(Region.ExtractedFunction && "Region has no extracted function?"); @@ -1136,8 +1575,7 @@ if (!RI) continue; Value *RetVal = RI->getReturnValue(); - DenseMap::iterator VBBIt, PHIVBBIt; - VBBIt = OutputBBs.find(RetVal); + DenseMap::iterator VBBIt = OutputBBs.find(RetVal); assert(VBBIt != OutputBBs.end() && "Could not find output value!"); // If this is storing a PHINode, we must make sure it is included in the @@ -1153,12 +1591,46 @@ LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " << *OutputBB << "\n"); - if (FirstFunction) + // If this is storing a PHINode, we must make sure it is included in the + // overall function. + if (!isa(ValueOperand)) { + if (FirstFunction) + continue; + Value *CorrVal = + Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); + assert(CorrVal && "Value is nullptr?"); + NewI->setOperand(0, CorrVal); + continue; + } + PHINode *PN = cast(SI->getValueOperand()); + // If it has a value, it was not split by the code extractor, which + // is what we are looking for. + if (Region.Candidate->getGVN(PN).hasValue()) + continue; + + // We record the parent block for the PHINode in the Region so that + // we can exclude it from checks later on. + Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent())); + + // If this is the first function, we do not need to worry about mergiing + // this with any other block in the overall outlined function, so we can + // just continue. + if (FirstFunction) { + BasicBlock *PHIBlock = PN->getParent(); + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); continue; - Value *CorrVal = - Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand); - assert(CorrVal && "Value is nullptr?"); - NewI->setOperand(0, CorrVal); + } + + // We look for the aggregate block that contains the PHINodes leading into + // this exit path. If we can't find one, we create one. + BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal); + + // For our PHINode, we find the combined canonical numbering, and + // attempt to find a matching PHINode in the overall PHIBlock. If we + // cannot, we copy the PHINode and move it into this new block. + PHINode *NewPN = + findOrCreatePHIInBlock(PN, Region, OverallPhiBlock, OutputMappings); + NewI->setOperand(0, NewPN); } // If we added an edge for basic blocks without a predecessor, we remove it @@ -1399,7 +1871,12 @@ Module &M, OutlinableGroup &OG, DenseMap &EndBBs, std::vector> &OutputStoreBBs) { // We only need the switch statement if there is more than one store - // combination. + // combination, or there is more than one set of output blocks. The first + // will occur when we store different sets of values for two different + // regions. The second will occur when we have two outputs that are combined + // in a PHINode outside of the region in one outlined instance, and are used + // seaparately in another. This will create the same set of OutputGVNs, but + // will generate two different output schemes. if (OG.OutputGVNCombinations.size() > 1) { Function *AggFunc = OG.OutlinedFunction; // Create a final block for each different return block. @@ -1442,8 +1919,14 @@ return; } + assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!"); + // If there needs to be stores, move them from the output blocks to their - // corresponding ending block. + // corresponding ending block. We do not check that the OutputGVNCombinations + // is equal to 1 here since that could just been the case where there are 0 + // outputs. Instead, we check whether there is more than one set of output + // blocks since this is the only case where we would have to move the + // stores, and erase the extraneous blocks. if (OutputStoreBBs.size() == 1) { LLVM_DEBUG(dbgs() << "Move store instructions to the end block in " << *OG.OutlinedFunction << "\n"); @@ -1475,10 +1958,13 @@ /// set of stores needed for the different functions. /// \param [in,out] FuncsToRemove - Extracted functions to erase from module /// once outlining is complete. +/// \param [in] OutputMappings - Extracted functions to erase from module +/// once outlining is complete. static void fillOverallFunction( Module &M, OutlinableGroup &CurrentGroup, std::vector> &OutputStoreBBs, - std::vector &FuncsToRemove) { + std::vector &FuncsToRemove, + const DenseMap &OutputMappings) { OutlinableRegion *CurrentOS = CurrentGroup.Regions[0]; // Move first extracted function's instructions into new function. @@ -1499,7 +1985,7 @@ CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBBs, true); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true); replaceConstants(*CurrentOS); // We first identify if any output blocks are empty, if they are we remove @@ -1533,7 +2019,8 @@ OutlinableRegion *CurrentOS; - fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove, + OutputMappings); std::vector SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { @@ -1547,8 +2034,7 @@ createAndInsertBasicBlocks( CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, "output_block_" + Twine(static_cast(Idx))); - - replaceArgumentUses(*CurrentOS, NewBBs); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); @@ -1716,6 +2202,33 @@ return RegionBenefit; } +/// For the \p OutputCanon number passed in find the value represented by this, +/// canonical number. If it is from a PHINode, we pick one the first incoming +/// value and return that. +/// +/// \param Region - The OutlinableRegion to get the Value from. +/// \param OutputCanon - The canonical number to find the Value from. +/// \returns The Value extracted from \p OutputCanon. +static Value *extractValueFromPhiNumOrCanonNum(OutlinableRegion &Region, + unsigned OutputCanon) { + OutlinableGroup &CurrentGroup = *Region.Parent; + // If the value is greater than the value in the tracker, we have a + // PHINode and will instead use one of the incoming values to find the + // type. + if (OutputCanon > CurrentGroup.PHINodeGVNTracker) { + auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon); + assert(It != CurrentGroup.PHINodeGVNToGVNs.end() && + "Could not find GVN set for PHINode number!"); + assert(It->second.second.size() > 0 && "PHINode does not have any values!"); + OutputCanon = *It->second.second.begin(); + } + Optional OGVN = Region.Candidate->fromCanonicalNum(OutputCanon); + assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?"); + Optional OV = Region.Candidate->fromGVN(*OGVN); + assert(OV.hasValue() && "Could not find value for GVN?"); + return *OV; +} + InstructionCost IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { InstructionCost OverallCost = 0; @@ -1723,10 +2236,8 @@ TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent()); // Each output incurs a load after the call, so we add that to the cost. - for (unsigned OutputGVN : Region->GVNStores) { - Optional OV = Region->Candidate->fromGVN(OutputGVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : Region->GVNStores) { + Value *V = extractValueFromPhiNumOrCanonNum(*Region, OutputCanon); InstructionCost LoadCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1755,6 +2266,7 @@ InstructionCost OutputCost = 0; unsigned NumOutputBranches = 0; + OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0]; IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; DenseSet CandidateBlocks; Candidate.getBasicBlocks(CandidateBlocks); @@ -1780,10 +2292,8 @@ for (const ArrayRef &OutputUse : CurrentGroup.OutputGVNCombinations) { - for (unsigned GVN : OutputUse) { - Optional OV = Candidate.fromGVN(GVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : OutputUse) { + Value *V = extractValueFromPhiNumOrCanonNum(FirstRegion, OutputCanon); InstructionCost StoreCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); Index: llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do differentiate between outputs of the region stored in PHINodes +; versus those stored outside of PHINodes. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = add i32 %c, 1 + %2 = add i32 %e, 1 + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32* null, i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[C_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[C_LOC]], i32* [[E_LOC]], i32 1) +; CHECK-NEXT: [[C_RELOAD:%.*]] = load i32, i32* [[C_LOC]], align 4 +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C_RELOAD]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[E_RELOAD]], 1 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP3:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP3]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[C]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[E]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[C]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[E]], i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; Index: llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to extract blocks that contain PHINodes, and selectively +; store into it's respective block, creating a new block if needed. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[PHI_BLOCK:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[PHI_BLOCK]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[TMP3:%.*]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: phi_block: +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: final_block_0: +; CHECK-NEXT: ret void +; Index: llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to extract blocks that contain PHINodes, and selectively +; store into it's respective block, only using if needed. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br label %first +test: + %d = load i32, i32* %0, align 4 + br label %first +first: + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1) +; CHECK-NEXT: br label [[FIRST:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: final_block_0: +; CHECK-NEXT: ret void +; Index: llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do not extract similar regions that would involve the splitting +; of phi nodes on exit. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + %2 = add i32 %d, 1 + %3 = add i32 %e, 1 + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = add i32 %d, 1 + %2 = add i32 %e, 1 + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[D_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4 +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[D_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[E_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* null, i32 1) +; CHECK-NEXT: [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4 +; CHECK-NEXT: [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP4:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP4]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: store i32 [[D]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[D]], i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[E]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; +; +; CHECK-LABEL: @outlined_ir_func_1( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[NEXT_TO_OUTLINE:%.*]] +; CHECK: next_to_outline: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0:%.*]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1:%.*]], 1 +; CHECK-NEXT: br label [[NEXT_AFTER_OUTLINE_EXITSTUB:%.*]] +; CHECK: next_after_outline.exitStub: +; CHECK-NEXT: ret void +; Index: llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT]] +; CHECK: block_6.split: +; CHECK-NEXT: [[DIFF_CE:%.*]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ] +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[DIFF_CE]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: ret void +; Index: llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll =================================================================== --- llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll +++ llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll @@ -37,42 +37,50 @@ } ; CHECK-LABEL: @function1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 -; CHECK-NEXT: br label [[TEST1:%.*]] -; CHECK: test1: -; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) ; CHECK-NEXT: br label [[FIRST:%.*]] -; CHECK: test: -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]]) -; CHECK-NEXT: br label [[FIRST]] ; CHECK: first: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: @function2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 -; CHECK-NEXT: br label [[TEST1:%.*]] -; CHECK: test1: -; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) ; CHECK-NEXT: br label [[FIRST:%.*]] -; CHECK: test: -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[TMP0]]) -; CHECK-NEXT: br label [[FIRST]] ; CHECK: first: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; ; -; CHECK: define internal void @outlined_ir_func_0( +; CHECK-LABEL: define internal void @outlined_ir_func_0( ; CHECK-NEXT: newFuncRoot: -; CHECK-NEXT: br label [[TEST_TO_OUTLINE:%.*]] -; CHECK: test_to_outline: -; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br label [[FIRST_SPLIT]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] ; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] ; CHECK: first.exitStub: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we do not extract similar regions that would involve the splitting +; of phi nodes on exit. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + br label %test1 +test1: + %e = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + ret void +next: + %1 = phi i32 [ %c, %test ], [ %e, %test1 ] + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 1) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: br label [[TEST1:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT:%.*]], label [[PHI_BLOCK:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[PHI_BLOCK]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: ] +; CHECK: next.exitStub: +; CHECK-NEXT: switch i32 [[TMP2]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[TMP3:%.*]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: phi_block: +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ] +; CHECK-NEXT: br label [[NEXT_EXITSTUB:%.*]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; Index: llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Show that we are able to propogate inputs to the region into the split PHINode +; outside of the region if necessary. + +define void @function1(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + %z = add i32 %c, %c + br i1 true, label %test1, label %first +test1: + %e = load i32, i32* %0, align 4 + %1 = add i32 %c, %c + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ] + ret void +next: + ret void +} + +define void @function2(i32* %a, i32* %b) { +entry: + %0 = alloca i32, align 4 + %c = load i32, i32* %0, align 4 + %z = mul i32 %c, %c + br i1 true, label %test1, label %first +test1: + %e = load i32, i32* %0, align 4 + %1 = add i32 %c, %c + br i1 true, label %first, label %test +test: + %d = load i32, i32* %0, align 4 + br i1 true, label %first, label %next +first: + %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ] + ret void +next: + ret void +} +; CHECK-LABEL: @function1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[Z:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @function2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[Z:%.*]] = mul i32 [[C]], [[C]] +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]]) +; CHECK-NEXT: [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]] +; CHECK: first: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: next: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define internal i1 @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: br i1 true, label [[TEST1:%.*]], label [[FIRST_SPLIT:%.*]] +; CHECK: test1: +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1:%.*]], [[TMP1]] +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[TEST:%.*]] +; CHECK: test: +; CHECK-NEXT: [[D:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]] +; CHECK: first.split: +; CHECK-NEXT: [[DOTCE:%.*]] = phi i32 [ [[D]], [[TEST]] ], [ [[E]], [[TEST1]] ], [ [[TMP1]], [[ENTRY_TO_OUTLINE]] ] +; CHECK-NEXT: br label [[FIRST_EXITSTUB:%.*]] +; CHECK: first.exitStub: +; CHECK-NEXT: store i32 [[DOTCE]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: ret i1 true +; CHECK: next.exitStub: +; CHECK-NEXT: ret i1 false +;