diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h --- a/llvm/include/llvm/Transforms/IPO/IROutliner.h +++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -73,6 +73,10 @@ /// The number of extracted inputs from the CodeExtractor. unsigned NumExtractedInputs; + /// The number of the BasicBlock holding the appropriate stores for this + /// OutlinableRegion in the overall function. + unsigned OutputBlockNum; + /// Mapping the extracted argument number to the argument number in the /// overall function. Since there will be inputs, such as elevated constants /// that are not the same in each region in a SimilarityGroup, or values that @@ -87,6 +91,11 @@ /// since the CodeExtractor does not recognize constants. DenseMap AggArgToConstant; + /// The global value numbers that are used as outputs for this section. Once + /// extracted, each output will be stored to an output register. This + /// documents the global value numbers that are used in this pattern. + std::set GVNStores; + /// Used to create an outlined function. CodeExtractor *CE = nullptr; @@ -192,6 +201,15 @@ void findAddInputsOutputs(Module &M, OutlinableRegion &Region, DenseSet &NotSame); + /// Update the output mapping based on the load instruction, and the outputs + /// of the extracted function. + /// + /// \param Region - The region extracted. + /// \param Outputs - The outputs from the extracted function. + /// \param LI - The load instruction used to update the mapping. + void updateOutputMapping(OutlinableRegion &Region, + SetVector &Outputs, LoadInst *LI); + /// Extract \p Region into its own function. /// /// \param [in] Region - The region to be extracted into its own function. @@ -218,6 +236,11 @@ /// TargetTransformInfo lambda for target specific information. function_ref getTTI; + /// A mapping from newly created reloaded output values to the original value. 
+ /// If a value is replaced by an output from an outlined region, this maps + /// that Value back to its original Value. + DenseMap OutputMappings; + /// IRSimilarityIdentifier lambda to retrieve IRSimilarityIdentifier. function_ref getIRSI; diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -50,6 +50,9 @@ /// for extraction. bool IgnoreGroup = false; + /// The return block for the overall function. + BasicBlock *EndBB = nullptr; + /// Flag for whether the \ref ArgumentTypes have been defined after the /// extraction of the first region. bool InputTypesSet = false; @@ -342,18 +345,47 @@ /// CodeExtractor. /// \param [out] EndInputNumbers - The global value numbers for the extracted /// arguments. +/// \param [in] OutputMappings - The mapping of values that have been replaced +/// by a new output value. +/// Inputs found in this mapping are numbered by their original value rather +/// than the replacement. static void mapInputsToGVNs(IRSimilarityCandidate &C, SetVector &CurrentInputs, + DenseMap &OutputMappings, std::vector &EndInputNumbers) { - // Get the global value number for each input. + // Get the Global Value Number for each input. We check if the Value has been + // replaced by a different value at output, and use the original value before + // replacement. for (Value *Input : CurrentInputs) { assert(Input && "Have a nullptr as an input"); + if (OutputMappings.find(Input) != OutputMappings.end()) + Input = OutputMappings.find(Input)->second; assert(C.getGVN(Input).hasValue() && "Could not find a numbering for the given input"); EndInputNumbers.push_back(C.getGVN(Input).getValue()); } } +/// Find the original value for the \p ArgInputs values if any one of them was +/// replaced during a previous extraction. +/// +/// \param [in] ArgInputs - The inputs to be extracted by the code extractor. 
+/// \param [in] OutputMappings - The mapping of values that have been replaced +/// by a new output value. +/// \param [out] RemappedArgInputs - The remapped values according to +/// \p OutputMappings that will be extracted. +static void remapExtractedInputs(SetVector &ArgInputs, + DenseMap &OutputMappings, + SetVector &RemappedArgInputs) { + // Collect each input that will be extracted as an argument by the code + // extractor, remapping to the original value if needed for reloaded values. + for (Value *Input : ArgInputs) { + if (OutputMappings.find(Input) != OutputMappings.end()) + Input = OutputMappings.find(Input)->second; + RemappedArgInputs.insert(Input); + } +} + /// Find the input GVNs and the output values for a region of Instructions. /// Using the code extractor, we collect the inputs to the extracted function. /// @@ -366,19 +398,25 @@ /// \param [in] NotSame - The global value numbers in the region that do not /// have the same constant value in the regions structurally similar to /// \p Region. +/// \param [in] OutputMappings - The mapping of values that have been replaced +/// by a new output value after extraction. /// \param [out] ArgInputs - The values of the inputs to the extracted function. -static void getCodeExtractorArguments(OutlinableRegion &Region, - std::vector &InputGVNs, - DenseSet &NotSame, - SetVector &ArgInputs) { +/// \param [out] Outputs - The set of values extracted by the CodeExtractor +/// as outputs. +static void getCodeExtractorArguments( + OutlinableRegion &Region, std::vector &InputGVNs, + DenseSet &NotSame, DenseMap &OutputMappings, + SetVector &ArgInputs, SetVector &Outputs) { IRSimilarityCandidate &C = *Region.Candidate; // OverallInputs are the inputs to the region found by the CodeExtractor, // SinkCands and HoistCands are used by the CodeExtractor to find sunken // allocas of values whose lifetimes are contained completely within the - // outlined region. 
Outputs are values used outside of the outlined region - // found by the CodeExtractor. - SetVector OverallInputs, SinkCands, HoistCands, Outputs; + // outlined region. PremappedInputs are the arguments found by the + // CodeExtractor, removing conditions such as sunken allocas, but that + // may need to be remapped due to the extracted output values replacing + // the original values. + SetVector OverallInputs, PremappedInputs, SinkCands, HoistCands; // Use the code extractor to get the inputs and outputs, without sunken // allocas or removing llvm.assumes. @@ -398,27 +436,23 @@ // Find if any values are going to be sunk into the function when extracted CE->findAllocas(CEAC, SinkCands, HoistCands, Dummy); - CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); - - // TODO: Support regions with output values. Outputs add an extra layer of - // resolution that adds too much complexity at this stage. - if (Outputs.size() > 0) { - Region.IgnoreRegion = true; - return; - } + CE->findInputsOutputs(PremappedInputs, Outputs, SinkCands); // TODO: Support regions with sunken allocas: values whose lifetimes are // contained completely within the outlined region. These are not guaranteed // to be the same in every region, so we must elevate them all to arguments // when they appear. If these values are not equal, it means there is some // Input in OverallInputs that was removed for ArgInputs. - if (ArgInputs.size() != OverallInputs.size()) { + if (OverallInputs.size() != PremappedInputs.size()) { Region.IgnoreRegion = true; return; } findConstants(C, NotSame, InputGVNs); - mapInputsToGVNs(C, OverallInputs, InputGVNs); + + mapInputsToGVNs(C, OverallInputs, OutputMappings, InputGVNs); + + remapExtractedInputs(PremappedInputs, OutputMappings, ArgInputs); // Sort the GVNs, since we now have constants included in the \ref InputGVNs // we need to make sure they are in a deterministic order. @@ -437,7 +471,7 @@ /// function. 
static void findExtractedInputToOverallInputMapping(OutlinableRegion &Region, - std::vector InputGVNs, + std::vector &InputGVNs, SetVector &ArgInputs) { IRSimilarityCandidate &C = *Region.Candidate; @@ -493,12 +527,80 @@ Region.NumExtractedInputs = OriginalIndex; } +/// Create a mapping of the output arguments for the \p Region to the output +/// arguments of the overall outlined function. +/// +/// \param [in,out] Region - The region of code to be analyzed. +/// \param [in] Outputs - The values found by the code extractor. +static void +findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, + SetVector Outputs) { + OutlinableGroup &Group = *Region.Parent; + IRSimilarityCandidate &C = *Region.Candidate; + + // This counts the argument number in the extracted function. + unsigned OriginalIndex = Region.NumExtractedInputs; + + // This counts the argument number in the overall function. + unsigned TypeIndex = Group.NumAggregateInputs; + bool TypeFound; + DenseSet AggArgsUsed; + + // Iterate over the output types and identify if there is an aggregate pointer + // type whose base type matches the current output type. If there is, we mark + // that we will use this output register for this value. If not we add another + // type to the overall argument type list. We also store the GVNs used for + // stores to identify which values will need to be moved into a special + // block that holds the stores to the output registers. + for (Value *Output : Outputs) { + TypeFound = false; + // We can do this since it is a result value, and will have a number + // that is necessarily the same. BUT if in the future, the instructions + // do not have to be in same order, but are functionally the same, we will + // have to use a different scheme, as one-to-one correspondence is not + // guaranteed. 
+ unsigned GlobalValue = C.getGVN(Output).getValue(); + unsigned ArgumentSize = Group.ArgumentTypes.size(); + + for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { + if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) + continue; + + if (AggArgsUsed.find(Jdx) != AggArgsUsed.end()) + continue; + + TypeFound = true; + AggArgsUsed.insert(Jdx); + Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); + Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); + Region.GVNStores.insert(GlobalValue); + break; + } + + // We were unable to find an unused type in the output type set that matches + // the output, so we add a pointer type to the argument types of the overall + // function to handle this output and create a mapping to it. + if (!TypeFound) { + Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType())); + AggArgsUsed.insert(Group.ArgumentTypes.size() - 1); + Region.ExtractedArgToAgg.insert( + std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); + Region.AggArgToExtracted.insert( + std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); + Region.GVNStores.insert(GlobalValue); + } + OriginalIndex++; + TypeIndex++; + } +} + void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region, DenseSet &NotSame) { std::vector Inputs; - SetVector ArgInputs; + SetVector ArgInputs, Outputs; - getCodeExtractorArguments(Region, Inputs, NotSame, ArgInputs); + getCodeExtractorArguments(Region, Inputs, NotSame, OutputMappings, ArgInputs, + Outputs); if (Region.IgnoreRegion) return; @@ -506,6 +608,10 @@ // Map the inputs found by the CodeExtractor to the arguments found for // the overall function. findExtractedInputToOverallInputMapping(Region, Inputs, ArgInputs); + + // Map the outputs found by the CodeExtractor to the arguments found for + // the overall function. 
+ findExtractedOutputToOverallOutputMapping(Region, Outputs); } /// Replace the extracted function in the Region with a call to the overall @@ -543,6 +649,18 @@ // new argument list. for (unsigned AggArgIdx = 0; AggArgIdx < AggFunc->arg_size(); AggArgIdx++) { + if (AggArgIdx == AggFunc->arg_size() - 1 && + Group.ArgumentTypes.size() > Group.NumAggregateInputs) { + // If we are on the last argument, and we need to differentiate between + // output blocks, add an integer to the argument list to determine + // what block to take + LLVM_DEBUG(dbgs() << "Set switch block argument to " + << Region.OutputBlockNum << "\n"); + NewCallArgs.push_back(ConstantInt::get(Type::getInt32Ty(M.getContext()), + Region.OutputBlockNum)); + continue; + } + ArgPair = Region.AggArgToExtracted.find(AggArgIdx); if (ArgPair != Region.AggArgToExtracted.end()) { Value *ArgumentValue = Call->getArgOperand(ArgPair->second); @@ -602,8 +720,11 @@ // Within an extracted function, replace the argument uses of the extracted // region with the arguments of the function for an OutlinableGroup. // -// \param OS [in] - The region of extracted code to be changed. -static void replaceArgumentUses(OutlinableRegion &Region) { +/// \param [in] Region - The region of extracted code to be changed. +/// \param [in,out] OutputBB - The BasicBlock for the output stores for this +/// region. 
+static void replaceArgumentUses(OutlinableRegion &Region, + BasicBlock *OutputBB) { OutlinableGroup &Group = *Region.Parent; assert(Region.ExtractedFunction && "Region has no extracted function?"); @@ -617,7 +738,28 @@ Argument *Arg = Region.ExtractedFunction->getArg(ArgIdx); // The argument is an input, so we can simply replace it with the overall // argument value - LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function " + if (ArgIdx < Region.NumExtractedInputs) { + LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function " + << *Region.ExtractedFunction << " with " << *AggArg + << " in function " << *Group.OutlinedFunction << "\n"); + Arg->replaceAllUsesWith(AggArg); + continue; + } + + // If we are replacing an output, we place the store value in its own + // block inside the overall function before replacing the use of the output + // in the function. NOTE(review): dyn_cast result below is used unchecked. + assert(Arg->hasOneUse() && "Output argument can only have one use"); + User *InstAsUser = Arg->user_back(); + + Instruction *I = dyn_cast(InstAsUser); + I->setDebugLoc(DebugLoc()); + LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " + << *OutputBB << "\n"); + + I->moveBefore(*OutputBB, OutputBB->end()); + + LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function " << *Region.ExtractedFunction << " with " << *AggArg << " in function " << *Group.OutlinedFunction << "\n"); Arg->replaceAllUsesWith(AggArg); @@ -655,38 +797,146 @@ } } +/// For the outlined section, move the needed StoreInsts for the output +/// registers into their own block. +/// +/// \param [in] OG - The group of regions to be outlined. +/// \param [in] OS - The region that is being analyzed. +/// \param [in,out] OutputBB - the block that stores for this region will be +/// placed in. +/// \param [in] EndBB - the final block of the extracted function. +/// \param [in] OutputMappings - The mapping of values that have +/// been replaced by a new output value. 
+/// \param [in,out] OutputStoreBBs - The existing output blocks. +void alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion *OS, + BasicBlock *OutputBB, BasicBlock *EndBB, + DenseMap &OutputMappings, + std::vector &OutputStoreBBs) { + std::set ValuesToFind = OS->GVNStores; + + // We iterate over the instructions in the extracted function, and find the + // global value number of the instructions. If we find a value that should + // be contained in a store, we replace the uses of the value with the value + // from the overall function, so that the store is storing the correct + // value from the overall function. + Function::iterator FIt = OG.OutlinedFunction->begin(); + for (BasicBlock &BB : *(OS->ExtractedFunction)) { + auto BBIt = FIt->instructionsWithoutDebug().begin(); + for (Instruction &Inst : BB.instructionsWithoutDebug()) { + if (Inst.isLifetimeStartOrEnd()) + continue; + Value *V = &Inst; + if (OutputMappings.find(V) != OutputMappings.end()) + V = OutputMappings.find(V)->second; + Optional GVN = OS->Candidate->getGVN(V); + + // If we have found one of the stored values for output, replace the value + // with the corresponding one from the overall function. + if (GVN.hasValue() && + ValuesToFind.find(GVN.getValue()) != ValuesToFind.end()) { + ValuesToFind.erase(GVN.getValue()); + Inst.replaceAllUsesWith(&(*BBIt)); + if (ValuesToFind.size() == 0) + break; + } + + BBIt++; + if (BBIt != FIt->instructionsWithoutDebug().end()) + while (BBIt->isLifetimeStartOrEnd()) + BBIt++; + } + if (ValuesToFind.size() == 0) + break; + FIt++; + } +} + +/// Create the switch statement for outlined function to differentiate between +/// all the output blocks. +/// +/// The terminator of \p EndBB is moved into a new final block, and each output +/// block in \p OutputStoreBBs is redirected to branch to that final block. +/// \param [in] M - The module we are outlining from. +/// \param [in] OG - The group of regions to be outlined. 
+/// \param [in] EndBB - The final block of the extracted function; the switch +/// statement is placed in this block. +/// \param [in,out] OutputStoreBBs - The existing output blocks. +void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB, + std::vector &OutputStoreBBs) { + Function *AggFunc = OG.OutlinedFunction; + // Create a final block + BasicBlock *ReturnBlock = + BasicBlock::Create(M.getContext(), "final_block", AggFunc); + Instruction *Term = EndBB->getTerminator(); + Term->moveBefore(*ReturnBlock, ReturnBlock->end()); + // Put the switch statement in the old end basic block for the function with + // a fall through to the new return block + LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " + << OutputStoreBBs.size() << "\n"); + SwitchInst *SwitchI = + SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), ReturnBlock, + OutputStoreBBs.size(), EndBB); + + unsigned Idx = 0; + for (BasicBlock *BB : OutputStoreBBs) { + SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), + BB); + Term = BB->getTerminator(); + Term->setSuccessor(0, ReturnBlock); + Idx++; + } + + return; +} + /// Fill the new function that will serve as the replacement function for all of /// the extracted regions of a certain structure from the first region in the /// list of regions. Replace this first region's extracted function with the /// new overall function. /// -/// \param M [in] - The module we are outlining from. -/// \param CurrentGroup [in] - The group of regions to be outlined. -/// \param FuncsToRemove [in,out] - Extracted functions to erase from module +/// \param [in] M - The module we are outlining from. +/// \param [in] CurrentGroup - The group of regions to be outlined. +/// \param [in,out] OutputStoreBBs - The output blocks for each different +/// set of stores needed for the different functions. 
+/// \param [in,out] FuncsToRemove - Extracted functions to erase from module /// once outlining is complete. static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup, + std::vector &OutputStoreBBs, std::vector &FuncsToRemove) { OutlinableRegion *CurrentOS = CurrentGroup.Regions[0]; - // Move first extracted function's instructions into new function + // Move first extracted function's instructions into new function. LLVM_DEBUG(dbgs() << "Move instructions from " << *CurrentOS->ExtractedFunction << " to instruction " << *CurrentGroup.OutlinedFunction << "\n"); - moveFunctionData(*CurrentOS->ExtractedFunction, - *CurrentGroup.OutlinedFunction); - // Transfer the attributes + CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction, + *CurrentGroup.OutlinedFunction); + + // Transfer the attributes from the function to the new function. for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttributes()) CurrentGroup.OutlinedFunction->addFnAttr(A); - replaceArgumentUses(*CurrentOS); + // Create an output block for the first extracted function. + BasicBlock *NewBB = + BasicBlock::Create(M.getContext(), "output_block_" + std::to_string(0), + CurrentGroup.OutlinedFunction); + CurrentOS->OutputBlockNum = 0; + + replaceArgumentUses(*CurrentOS, NewBB); replaceConstants(*CurrentOS); + if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) { + BranchInst::Create(CurrentGroup.EndBB, NewBB); + OutputStoreBBs.push_back(NewBB); + } else + NewBB->eraseFromParent(); + // Replace the call to the extracted function with the outlined function. CurrentOS->Call = replaceCalledFunction(M, *CurrentOS); - // We only delete the extracted funcitons at the end since we may need to + // We only delete the extracted functions at the end since we may need to // reference instructions contained in them for mapping purposes. 
FuncsToRemove.push_back(CurrentOS->ExtractedFunction); } @@ -700,17 +950,39 @@ OutlinableRegion *CurrentOS; - fillOverallFunction(M, CurrentGroup, FuncsToRemove); + fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); - // Do the same for the other extracted functions + BasicBlock *NewBB; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { CurrentOS = CurrentGroup.Regions[Idx]; - replaceArgumentUses(*CurrentOS); + // Create a new BasicBlock to hold the needed store instructions. + NewBB = BasicBlock::Create(M.getContext(), + "output_block_" + std::to_string(Idx), + CurrentGroup.OutlinedFunction); + replaceArgumentUses(*CurrentOS, NewBB); + + // We erase the BasicBlock if there are no outputs to be considered. + // Otherwise, we move the store instructions to their block, and create + // a branch to the finalization block of the overall function. + if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) { + alignOutputBlockWithAggFunc(CurrentGroup, CurrentOS, NewBB, + CurrentGroup.EndBB, OutputMappings, + OutputStoreBBs); + BranchInst::Create(CurrentGroup.EndBB, NewBB); + CurrentOS->OutputBlockNum = OutputStoreBBs.size(); + OutputStoreBBs.push_back(NewBB); + } else + NewBB->eraseFromParent(); + CurrentOS->Call = replaceCalledFunction(M, *CurrentOS); FuncsToRemove.push_back(CurrentOS->ExtractedFunction); } + // Create a switch statement to handle the different output schemes. + if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) + createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs); + OutlinedFunctionNum++; } @@ -765,11 +1037,45 @@ } } +void IROutliner::updateOutputMapping(OutlinableRegion &Region, + SetVector &Outputs, + LoadInst *LI) { + // For any load instructions following the call + Value *Operand = LI->getPointerOperand(); + Optional OutputIdx = None; + // Find if the operand is an output register. 
+ for (unsigned ArgIdx = Region.NumExtractedInputs; + ArgIdx < Region.Call->arg_size(); ArgIdx++) { + if (Operand == Region.Call->getArgOperand(ArgIdx)) { + OutputIdx = ArgIdx - Region.NumExtractedInputs; + break; + } + } + + // If we found an output register, place a mapping of the new value + // to the original in the mapping. + if (!OutputIdx.hasValue()) + return; + + if (OutputMappings.find(Outputs[OutputIdx.getValue()]) == + OutputMappings.end()) { + LLVM_DEBUG(dbgs() << "Mapping extracted output " << *LI << " to " + << *Outputs[OutputIdx.getValue()] << "\n"); + OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.getValue()])); + } else { + Value *Orig = OutputMappings.find(Outputs[OutputIdx.getValue()])->second; + LLVM_DEBUG(dbgs() << "Mapping extracted output " << *Orig << " to " + << *Outputs[OutputIdx.getValue()] << "\n"); + OutputMappings.insert(std::make_pair(LI, Orig)); + } +} + bool IROutliner::extractSection(OutlinableRegion &Region) { - assert(Region.StartBB != nullptr && - "StartBB for the OutlinableRegion is nullptr!"); - assert(Region.FollowBB != nullptr && - "StartBB for the OutlinableRegion is nullptr!"); + SetVector ArgInputs, Outputs, SinkCands; + Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); + + assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!"); + assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!"); Function *OrigF = Region.StartBB->getParent(); CodeExtractorAnalysisCache CEAC(*OrigF); Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); @@ -817,15 +1123,17 @@ // Iterate over the new set of instructions to find the new call // instruction. 
for (Instruction &I : *RewrittenBB) - if (CallInst *CI = dyn_cast(&I)) + if (CallInst *CI = dyn_cast(&I)) { if (Region.ExtractedFunction == CI->getCalledFunction()) Region.Call = CI; + } else if (LoadInst *LI = dyn_cast(&I)) + updateOutputMapping(Region, Outputs, LI); Region.reattachCandidate(); return true; } unsigned IROutliner::doOutline(Module &M) { - // Find the possibile similarity sections. + // Find the possible similarity sections. IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = Identifier.getSimilarity(); @@ -885,6 +1193,15 @@ CurrentGroup.Regions = OutlinedRegions; + if (CurrentGroup.Regions.empty()) + continue; + + // We are adding an extracted argument to decide between which output path + // to use in the basic block. It is used in a switch statement and only + // needs to be an integer. + if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) + CurrentGroup.ArgumentTypes.push_back(Type::getInt32Ty(M.getContext())); + // Create functions out of all the sections, and mark them as outlined OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { diff --git a/llvm/test/Transforms/IROutliner/extraction.ll b/llvm/test/Transforms/IROutliner/extraction.ll --- a/llvm/test/Transforms/IROutliner/extraction.ll +++ b/llvm/test/Transforms/IROutliner/extraction.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]]) ; CHECK-NEXT: ret void ; entry: @@ -32,7 +32,7 @@ ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) +; CHECK-NEXT: call void 
@outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]]) ; CHECK-NEXT: ret void ; entry: @@ -53,19 +53,23 @@ define void @extract_outs1() #0 { ; CHECK-LABEL: @extract_outs1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 2, i32* [[A]], align 4 -; CHECK-NEXT: store i32 3, i32* [[B]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[B]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[OUTPUT]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[OUTPUT]], align 4 -; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]]) +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4 +; CHECK-NEXT: call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) ; CHECK-NEXT: ret void ; entry: @@ -91,18 +95,22 @@ define void @extract_outs2() #0 { ; 
CHECK-LABEL: @extract_outs2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 2, i32* [[A]], align 4 -; CHECK-NEXT: store i32 3, i32* [[B]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[B]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: store i32 [[ADD]], i32* [[OUTPUT]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4 -; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]]) +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/IROutliner/illegal-assumes.ll b/llvm/test/Transforms/IROutliner/illegal-assumes.ll --- a/llvm/test/Transforms/IROutliner/illegal-assumes.ll +++ b/llvm/test/Transforms/IROutliner/illegal-assumes.ll @@ -7,15 +7,19 @@ define void @outline_assumes() { ; CHECK-LABEL: @outline_assumes( ; CHECK-NEXT: 
entry: +; CHECK-NEXT: [[DL_LOC:%.*]] = alloca i1, align 1 ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[D:%.*]] = alloca i1, align 4 -; CHECK-NEXT: store i1 true, i1* [[D]], align 4 -; CHECK-NEXT: [[DL:%.*]] = load i1, i1* [[D]], align 1 -; CHECK-NEXT: [[SPLIT_INST:%.*]] = sub i1 [[DL]], [[DL]] +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_3(i1 true, i1* [[D]], i1* [[DL_LOC]], i32 0) +; CHECK-NEXT: [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[SPLIT_INST:%.*]] = sub i1 [[DL_RELOAD]], [[DL_RELOAD]] ; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) -; CHECK-NEXT: call void @llvm.assume(i1 [[DL]]) +; CHECK-NEXT: call void @llvm.assume(i1 [[DL_RELOAD]]) ; CHECK-NEXT: call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]]) ; CHECK-NEXT: ret void ; @@ -40,14 +44,18 @@ define void @outline_assumes2() { ; CHECK-LABEL: @outline_assumes2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DL_LOC:%.*]] = alloca i1, align 1 ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[D:%.*]] = alloca i1, align 4 -; CHECK-NEXT: store i1 false, i1* [[D]], align 4 -; CHECK-NEXT: [[DL:%.*]] = load i1, i1* [[D]], align 1 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_3(i1 false, i1* [[D]], i1* [[DL_LOC]], i32 1) +; CHECK-NEXT: [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) ; CHECK-NEXT: call void 
@outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]]) -; CHECK-NEXT: call void @llvm.assume(i1 [[DL]]) +; CHECK-NEXT: call void @llvm.assume(i1 [[DL_RELOAD]]) ; CHECK-NEXT: call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll --- a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll +++ b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll @@ -9,12 +9,22 @@ define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) { ; CHECK-LABEL: @function1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load i8, i8* [[S:%.*]], align 1 -; CHECK-NEXT: [[B:%.*]] = load i8, i8* [[D:%.*]], align 1 +; CHECK-NEXT: [[B_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[A_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[RET_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0) +; CHECK-NEXT: [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1 +; CHECK-NEXT: [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]]) ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false) -; CHECK-NEXT: [[C:%.*]] = add i8 [[A]], [[B]] -; CHECK-NEXT: [[RET:%.*]] = load i8, i8* [[S]], align 1 -; CHECK-NEXT: ret i8 [[RET]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0) +; CHECK-NEXT: [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]]) 
+; CHECK-NEXT: ret i8 [[RET_RELOAD]] ; entry: %a = load i8, i8* %s @@ -28,12 +38,22 @@ define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) { ; CHECK-LABEL: @function2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load i8, i8* [[S:%.*]], align 1 -; CHECK-NEXT: [[B:%.*]] = load i8, i8* [[D:%.*]], align 1 +; CHECK-NEXT: [[B_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[A_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[RET_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1) +; CHECK-NEXT: [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1 +; CHECK-NEXT: [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]]) ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false) -; CHECK-NEXT: [[C:%.*]] = add i8 [[A]], [[B]] -; CHECK-NEXT: [[RET:%.*]] = load i8, i8* [[S]], align 1 -; CHECK-NEXT: ret i8 [[RET]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1) +; CHECK-NEXT: [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: ret i8 [[RET_RELOAD]] ; entry: %a = load i8, i8* %s diff --git a/llvm/test/Transforms/IROutliner/illegal-memmove.ll b/llvm/test/Transforms/IROutliner/illegal-memmove.ll --- a/llvm/test/Transforms/IROutliner/illegal-memmove.ll +++ b/llvm/test/Transforms/IROutliner/illegal-memmove.ll @@ -9,12 +9,22 @@ define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) { ; CHECK-LABEL: @function1( ; CHECK-NEXT: 
entry: -; CHECK-NEXT: [[A:%.*]] = load i8, i8* [[S:%.*]], align 1 -; CHECK-NEXT: [[B:%.*]] = load i8, i8* [[D:%.*]], align 1 +; CHECK-NEXT: [[B_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[A_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[RET_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0) +; CHECK-NEXT: [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1 +; CHECK-NEXT: [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]]) ; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false) -; CHECK-NEXT: [[C:%.*]] = add i8 [[A]], [[B]] -; CHECK-NEXT: [[RET:%.*]] = load i8, i8* [[S]], align 1 -; CHECK-NEXT: ret i8 [[RET]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0) +; CHECK-NEXT: [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: ret i8 [[RET_RELOAD]] ; entry: %a = load i8, i8* %s @@ -28,12 +38,22 @@ define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) { ; CHECK-LABEL: @function2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load i8, i8* [[S:%.*]], align 1 -; CHECK-NEXT: [[B:%.*]] = load i8, i8* [[D:%.*]], align 1 +; CHECK-NEXT: [[B_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[A_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[RET_LOC:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]]) 
+; CHECK-NEXT: call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1) +; CHECK-NEXT: [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1 +; CHECK-NEXT: [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]]) ; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false) -; CHECK-NEXT: [[C:%.*]] = add i8 [[A]], [[B]] -; CHECK-NEXT: [[RET:%.*]] = load i8, i8* [[S]], align 1 -; CHECK-NEXT: ret i8 [[RET]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1) +; CHECK-NEXT: [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]]) +; CHECK-NEXT: ret i8 [[RET_RELOAD]] ; entry: %a = load i8, i8* %s diff --git a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll --- a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll +++ b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll @@ -11,17 +11,20 @@ define i32 @func1(i32 %a, double %b, i8* %v, ...) 
nounwind { ; CHECK-LABEL: @func1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[AP1_LOC:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 ; CHECK-NEXT: [[AP:%.*]] = alloca i8*, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 -; CHECK-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 -; CHECK-NEXT: [[AP1:%.*]] = bitcast i8** [[AP]] to i8* -; CHECK-NEXT: call void @llvm.va_start(i8* [[AP1]]) +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 0) +; CHECK-NEXT: [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.va_start(i8* [[AP1_RELOAD]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg i8** [[AP]], i32 -; CHECK-NEXT: call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]]) -; CHECK-NEXT: call void @llvm.va_end(i8* [[AP1]]) +; CHECK-NEXT: call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]]) +; CHECK-NEXT: call void @llvm.va_end(i8* [[AP1_RELOAD]]) ; CHECK-NEXT: store i32 [[TMP0]], i32* [[C]], align 4 ; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: ret i32 [[TMP]] @@ -46,17 +49,20 @@ define i32 @func2(i32 %a, double %b, i8* %v, ...) 
nounwind { ; CHECK-LABEL: @func2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[AP1_LOC:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 ; CHECK-NEXT: [[AP:%.*]] = alloca i8*, align 4 ; CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 -; CHECK-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 -; CHECK-NEXT: [[AP1:%.*]] = bitcast i8** [[AP]] to i8* -; CHECK-NEXT: call void @llvm.va_start(i8* [[AP1]]) +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 1) +; CHECK-NEXT: [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.va_start(i8* [[AP1_RELOAD]]) ; CHECK-NEXT: [[TMP0:%.*]] = va_arg i8** [[AP]], i32 -; CHECK-NEXT: call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]]) -; CHECK-NEXT: call void @llvm.va_end(i8* [[AP1]]) +; CHECK-NEXT: call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]]) +; CHECK-NEXT: call void @llvm.va_end(i8* [[AP1_RELOAD]]) ; CHECK-NEXT: store i32 [[TMP0]], i32* [[C]], align 4 ; CHECK-NEXT: [[AP2:%.*]] = bitcast i8** [[AP]] to i8* ; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[C]], align 4 diff --git a/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -iroutliner < %s | FileCheck %s + +; These functions are constructed slightly differently so that they require +; 
different output blocks for the values used outside of the region. We are +; checking that two output blocks are created with different values. + +define void @outline_outputs1() #0 { +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4 +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + %sub = sub i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %3 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + ret void +} + +define void @outline_outputs2() #0 { +; CHECK-LABEL: 
@outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[SUB_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[SUB_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[SUB_LOC]], i32* [[DOTLOC]], i32 1) +; CHECK-NEXT: [[SUB_RELOAD:%.*]] = load i32, i32* [[SUB_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[SUB_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + %sub = sub i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %sub + store i32 %mul, i32* %result, align 4 + ret void +} + +; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 { +; CHECK: _after_outline.exitStub: +; CHECK-NEXT: switch i32 [[ARG5]], label [[BLOCK:%.*]] [ +; CHECK-NEXT: i32 0, label %[[BLOCK_0:.*]] +; CHECK-NEXT: i32 1, label %[[BLOCK_1:.*]] + +; 
CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[ARG0]], align 4 +; CHECK-NEXT: store i32 3, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARG0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARG2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4 + +; CHECK: [[BLOCK_0]]: +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARG3]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARG4]], align 4 + +; CHECK: [[BLOCK_1]]: +; CHECK-NEXT: store i32 [[SUB]], i32* [[ARG3]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARG4]], align 4 diff --git a/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -iroutliner < %s | FileCheck %s + +; This test checks that inputs that are replaced with the output of an outlined +; function are still recognized as the same value.
+ +define void @outline_outputs1() #0 { +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD2_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32 2, i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4 +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: [[LT_CAST4:%.*]] = bitcast i32* [[ADD2_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST4]]) +; CHECK-NEXT: [[LT_CAST5:%.*]] = bitcast i32* [[DOTLOC2]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST5]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32 [[ADD_RELOAD]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[OUTPUT2]], i32* [[ADD2_LOC]], i32* [[DOTLOC2]], i32 1) +; 
CHECK-NEXT: [[ADD2_RELOAD:%.*]] = load i32, i32* [[ADD2_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD3:%.*]] = load i32, i32* [[DOTLOC2]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST4]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST5]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD3]], i32 [[ADD2_RELOAD]], i32* [[RESULT2]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %3 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + br label %next +next: + store i32 %add, i32* %output, align 4 + store i32 3, i32* %result, align 4 + %4 = load i32, i32* %output, align 4 + %5 = load i32, i32* %result, align 4 + %add2 = add i32 %4, %5 + store i32 %add2, i32* %output2, align 4 + %6 = load i32, i32* %output2, align 4 + %mul2 = mul i32 %6, %add2 + store i32 %mul2, i32* %result2, align 4 + ret void +} diff --git a/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -iroutliner < %s | FileCheck %s + +; These functions are constructed slightly differently so that they require +; the same output blocks for the values used outside of the region. We are +; checking that two output blocks are created with the same store instructions. 
+ +define void @outline_outputs1() #0 { +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4 +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %3 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + ret void +} + +define void @outline_outputs2() #0 { +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTLOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: 
[[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1) +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + %0 = load i32, i32* %a, align 4 + %1 = load i32, i32* %b, align 4 + %add = add i32 %0, %1 + store i32 %add, i32* %output, align 4 + %2 = load i32, i32* %output, align 4 + %mul = mul i32 %2, %add + store i32 %mul, i32* %result, align 4 + ret void +} + +; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 { +; CHECK: entry_after_outline.exitStub: +; CHECK-NEXT: switch i32 [[ARG5]], label [[BLOCK:%.*]] [ +; CHECK-NEXT: i32 0, label %[[BLOCK_0:.*]] +; CHECK-NEXT: i32 1, label %[[BLOCK_1:.*]] + +; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[ARG0]], align 4 +; CHECK-NEXT: store i32 3, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARG0]], 
align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARG2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4 + +; CHECK: [[BLOCK_0]]: +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARG3]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARG4]], align 4 + +; CHECK: [[BLOCK_1]]: +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARG3]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARG4]], align 4