diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@
 FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
+FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
@@ -288,6 +289,9 @@
 void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
 extern char &SIOptimizeExecMaskingPreRAID;
 
+void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &);
+extern char &SIOptimizeVGPRLiveRangeID;
+
 void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
 extern char &AMDGPUAnnotateUniformValuesPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -162,6 +162,11 @@
   cl::init(true), cl::Hidden);
 
+static cl::opt<bool> OptVGPRLiveRange(
+    "amdgpu-opt-vgpr-liverange",
+    cl::desc("Enable VGPR liverange optimizations for if-else structures"),
+    cl::init(false), cl::Hidden);
+
 // Enable atomic optimization
 static cl::opt<bool> EnableAtomicOptimizations(
   "amdgpu-atomic-optimizations",
@@ -220,6 +225,7 @@
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
+  initializeSIOptimizeVGPRLiveRangePass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
@@ -1169,6 +1175,8 @@
   insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
   insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+  if (OptVGPRLiveRange)
+    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -131,6 +131,7 @@
   SIMemoryLegalizer.cpp
   SIOptimizeExecMasking.cpp
   SIOptimizeExecMaskingPreRA.cpp
+  SIOptimizeVGPRLiveRange.cpp
   SIPeepholeSDWA.cpp
   SIPostRABundler.cpp
   SIPreEmitPeephole.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
new file
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -0,0 +1,497 @@
+//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass tries to remove unnecessary VGPR live ranges in divergent
+/// if-else structures.
+///
+/// When we do structurization, we usually transform an if-else into two
+/// successive if-thens (with a flow block to do the predicate inversion).
+/// Consider a simple case after structurization: a divergent value %a is
+/// defined before the if-else and used in both the THEN part (the use in
+/// THEN is optional) and the ELSE part:
+///
+/// bb.if:
+///   %a = ...
+///   ...
+/// bb.then:
+///   ... = op %a
+///   ... // %a can be dead here
+/// bb.flow:
+///   ...
+/// bb.else:
+///   ... = %a
+///   ...
+/// bb.endif
+///
+/// As LLVM has no idea of the thread control flow, it will just assume %a is
+/// alive in the whole range of bb.then because of the later use in bb.else.
+/// On the AMDGPU architecture, however, VGPRs are accessed with respect to
+/// the exec mask: for this if-else case, the lanes active in bb.then are
+/// inactive in bb.else, and vice versa. So we are safe to say that %a is
+/// dead from the last use in bb.then until the end of that block, because
+/// the instructions in bb.then will only overwrite lanes that will never be
+/// accessed in bb.else.
+///
+/// This pass tells LLVM that %a is in fact dead by inserting a phi-node in
+/// bb.flow which says that %a is undef when coming from bb.then, and then
+/// replacing the uses in bb.else with the result of the newly inserted phi.
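+///
+/// For illustration, the example above then becomes (schematically):
+///
+/// bb.flow:
+///   %c = phi (%a from bb.if, undef from bb.then)
+/// bb.else:
+///   ... = %c
+///
+/// so the live range of %a now ends at its last use in bb.then.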
+///
+/// Two key conditions must be met to ensure correctness:
+/// 1.) The def-point should be in the same loop level as the if-else-endif,
+///     to make sure the second loop iteration still gets correct data.
+/// 2.) There should be no further uses after the IF-ELSE region.
+///
+//
+//===----------------------------------------------------------------------===//

+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-opt-vgpr-liverange"
+
+namespace {
+
+class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
+private:
+  const SIRegisterInfo *TRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  LiveVariables *LV = nullptr;
+  MachineDominatorTree *MDT = nullptr;
+  const MachineLoopInfo *Loops = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+public:
+  static char ID;
+
+  MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const;
+
+  void collectElseRegionBlocks(MachineBasicBlock *Flow,
+                               MachineBasicBlock *Endif,
+                               SmallVectorImpl<MachineBasicBlock *> &) const;
+
+  void
+  collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow,
+                            MachineBasicBlock *Endif,
+                            SmallVectorImpl<MachineBasicBlock *> &ElseBlocks,
+                            SmallVectorImpl<Register> &CandidateRegs) const;
+
+  void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
+                             SmallVectorImpl<MachineInstr *> &Uses) const;
+
+  void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If,
+                                   MachineBasicBlock *Flow) const;
+
+  void updateLiveRangeInElseRegion(
+      Register Reg, Register NewReg, MachineBasicBlock *Flow,
+      MachineBasicBlock *Endif,
+      SmallVectorImpl<MachineBasicBlock *> &ElseBlocks) const;
+
+  void
+  optimizeLiveRange(Register Reg, MachineBasicBlock *If,
+                    MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+                    SmallVectorImpl<MachineBasicBlock *> &ElseBlocks) const;
+
+  SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Optimize VGPR LiveRange";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveVariables>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<LiveVariables>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+};
+
+} // end anonymous namespace
+
+// Check whether the MBB is an else flow block and get the branching target,
+// which is the Endif block.
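+// For reference, an SI_ELSE terminator in MIR looks like this (cf. the MIR
+// checks in the tests below):
+//   %1:sreg_64 = SI_ELSE %0, %bb.endif, implicit-def $exec,
+//       implicit-def $scc, implicit $exec
+// so operand 2 holds the Endif target block.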
+MachineBasicBlock *
+SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
+  for (auto &Term : MBB->terminators()) {
+    if (Term.getOpcode() == AMDGPU::SI_ELSE)
+      return Term.getOperand(2).getMBB();
+  }
+  return nullptr;
+}
+
+void SIOptimizeVGPRLiveRange::collectElseRegionBlocks(
+    MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+    SmallVectorImpl<MachineBasicBlock *> &Blocks) const {
+  assert(Flow != Endif);
+
+  MachineBasicBlock *MBB = Endif;
+  unsigned Cur = 0;
+  while (MBB) {
+    for (auto *Pred : MBB->predecessors())
+      if (Pred != Flow && !llvm::is_contained(Blocks, Pred))
+        Blocks.push_back(Pred);
+
+    if (Cur < Blocks.size())
+      MBB = Blocks[Cur++];
+    else
+      MBB = nullptr;
+  }
+
+  LLVM_DEBUG(dbgs() << "Found Else blocks:");
+  for (auto *MBB : Blocks)
+    LLVM_DEBUG(dbgs() << " bb." << MBB->getNumber());
+  LLVM_DEBUG(dbgs() << "\n");
+}
+
+/// Find the instructions (excluding PHIs) in \p MBB that use \p Reg.
+void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock(
+    Register Reg, MachineBasicBlock *MBB,
+    SmallVectorImpl<MachineInstr *> &Uses) const {
+  for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+       ++I) {
+    auto *UseMI = I->getParent();
+    if (UseMI->getParent() == MBB && !UseMI->isPHI())
+      Uses.push_back(UseMI);
+  }
+}
+
+/// Collect the killed registers in the ELSE region which are not alive through
+/// the whole THEN region.
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
+    MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+    SmallVectorImpl<MachineBasicBlock *> &ElseBlocks,
+    SmallVectorImpl<Register> &CandidateRegs) const {
+
+  SmallSet<Register, 16> KillsInElse;
+
+  for (auto *Else : ElseBlocks) {
+    for (auto &MI : Else->instrs()) {
+      if (MI.isDebugInstr())
+        continue;
+      unsigned NumOps = MI.getNumOperands();
+      for (unsigned Op = 0; Op < NumOps; ++Op) {
+        MachineOperand &MO = MI.getOperand(Op);
+        if (!MO.isReg() || !MO.getReg() || MO.isDef())
+          continue;
+
+        Register MOReg = MO.getReg();
+        // We can only optimize VGPR virtual registers.
+        if (MOReg.isPhysical() || !TRI->isVGPR(*MRI, MOReg))
+          continue;
+
+        if (MO.isKill() && MO.readsReg()) {
+          LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+          const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+          // Make sure two conditions are met:
+          // a.) the value is defined before/in the IF block;
+          // b.) the value is defined at the same loop level.
+          if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+              Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+            KillsInElse.insert(MOReg);
+        }
+      }
+    }
+  }
+
+  // Check the phis in the Endif block, looking for values coming from the
+  // ELSE region. Make sure the phi-use is the last use.
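+  // A typical Endif phi looks like this (cf. bb.4.end in the tests below;
+  // the value names are illustrative):
+  //   %r:vgpr_32 = PHI %flow_val, %bb.flow, %else_val, %bb.else
+  // and only the incoming values from non-Flow predecessors are considered.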
+  for (auto &MI : Endif->phis()) {
+    for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+      auto &MO = MI.getOperand(Idx);
+      auto *Pred = MI.getOperand(Idx + 1).getMBB();
+      if (Pred == Flow)
+        continue;
+
+      if (!MO.isReg() || !MO.getReg() || MO.isUndef())
+        continue;
+      Register Reg = MO.getReg();
+      if (Reg.isPhysical() || !TRI->isVGPR(*MRI, Reg))
+        continue;
+
+      LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+      const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent();
+
+      if (VI.isLiveIn(*Endif, Reg, *MRI)) {
+        LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI)
+                          << " as live in Endif\n");
+        continue;
+      }
+      // Make sure two conditions are met:
+      // a.) the value is defined before/in the IF block;
+      // b.) the value is defined at the same loop level.
+      if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+          Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+        KillsInElse.insert(Reg);
+    }
+  }
+
+  auto IsLiveThroughThen = [&](Register Reg) {
+    for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+         ++I) {
+      if (!I->readsReg())
+        continue;
+      auto *UseMI = I->getParent();
+      auto *UseMBB = UseMI->getParent();
+      if (UseMBB == Flow || UseMBB == Endif) {
+        if (!UseMI->isPHI())
+          return true;
+
+        auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB();
+        // The register is live through the path If->Flow or Flow->Endif.
+        // We should not optimize such cases.
+        if ((UseMBB == Flow && IncomingMBB != If) ||
+            (UseMBB == Endif && IncomingMBB == Flow))
+          return true;
+      }
+    }
+    return false;
+  };
+
+  for (auto Reg : KillsInElse)
+    if (!IsLiveThroughThen(Reg))
+      CandidateRegs.push_back(Reg);
+}
+
+// Re-calculate the liveness of \p Reg in the THEN region.
+void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
+    Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
+
+  SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming;
+
+  MachineBasicBlock *ThenEntry = nullptr;
+  for (auto *Succ : If->successors()) {
+    if (Succ != Flow)
+      ThenEntry = Succ;
+  }
+
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+  df_iterator_default_set<MachineBasicBlock *, 16> Visited;
+
+  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+    if (MBB == Flow)
+      break;
+
+    // Clear the live bit, as we will recalculate it below.
+    LLVM_DEBUG(dbgs() << "Clear AliveBlock bb." << MBB->getNumber() << "\n");
+    OldVarInfo.AliveBlocks.reset(MBB->getNumber());
+  }
+
+  // Get the blocks that Reg should be alive through.
+  for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+       ++I) {
+    auto *UseMI = I->getParent();
+    if (UseMI->isPHI() && I->readsReg()) {
+      if (Visited.contains(UseMI->getParent()))
+        PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
+    }
+  }
+
+  Visited.clear();
+
+  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+    if (MBB == Flow)
+      break;
+
+    SmallVector<MachineInstr *> Uses;
+    // PHI instructions have already been processed above.
+    findNonPHIUsesInBlock(Reg, MBB, Uses);
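+    // Note: LiveVariables::HandleVirtRegUse records a use of Reg at the
+    // given instruction, extending the live range up to it and updating the
+    // kill list in Reg's VarInfo.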
+    if (Uses.size() == 1) {
+      LLVM_DEBUG(dbgs() << "Found one Non-PHI use in bb." << MBB->getNumber()
+                        << "\n");
+      LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin()));
+    } else if (Uses.size() > 1) {
+      // Process the instructions in order.
+      LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in bb."
+                        << MBB->getNumber() << "\n");
+      for (MachineInstr &MI : *MBB) {
+        if (llvm::is_contained(Uses, &MI))
+          LV->HandleVirtRegUse(Reg, MBB, MI);
+      }
+    }
+
+    // Mark Reg alive through the block if this is a PHI incoming block.
+    if (PHIIncoming.contains(MBB))
+      LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(),
+                                  MBB);
+  }
+
+  // Set the kill flag if we get new Kills in the THEN region.
+  for (auto *MI : OldVarInfo.Kills)
+    if (Visited.contains(MI->getParent()))
+      MI->addRegisterKilled(Reg, TRI);
+}
+
+void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion(
+    Register Reg, Register NewReg, MachineBasicBlock *Flow,
+    MachineBasicBlock *Endif,
+    SmallVectorImpl<MachineBasicBlock *> &ElseBlocks) const {
+  LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+  // Transfer AliveBlocks from Reg to NewReg.
+  for (auto *MBB : ElseBlocks) {
+    unsigned BBNum = MBB->getNumber();
+    if (OldVarInfo.AliveBlocks.test(BBNum)) {
+      NewVarInfo.AliveBlocks.set(BBNum);
+      LLVM_DEBUG(dbgs() << "Removing AliveBlock bb." << BBNum << "\n");
+      OldVarInfo.AliveBlocks.reset(BBNum);
+    }
+  }
+
+  // Transfer the possible Kills in ElseBlocks from Reg to NewReg.
+  std::vector<MachineInstr *>::iterator I = OldVarInfo.Kills.begin();
+  while (I != OldVarInfo.Kills.end()) {
+    if (llvm::is_contained(ElseBlocks, (*I)->getParent())) {
+      NewVarInfo.Kills.push_back(*I);
+      I = OldVarInfo.Kills.erase(I);
+    } else {
+      ++I;
+    }
+  }
+}
+
+void SIOptimizeVGPRLiveRange::optimizeLiveRange(
+    Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow,
+    MachineBasicBlock *Endif,
+    SmallVectorImpl<MachineBasicBlock *> &ElseBlocks) const {
+  // Insert a new PHI, marking the value from the THEN region as undef.
+  LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << "\n");
+  auto *RC = MRI->getRegClass(Reg);
+  Register NewReg = MRI->createVirtualRegister(RC);
+  Register UndefReg = MRI->createVirtualRegister(RC);
+  MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
+                                    TII->get(TargetOpcode::PHI), NewReg);
+  for (auto *Pred : Flow->predecessors()) {
+    if (Pred == If)
+      PHI.addReg(Reg).addMBB(Pred);
+    else
+      PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+  }
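+
+  // The inserted PHI ends up with the following shape (cf. the Flow-block
+  // PHIs in the tests below):
+  //   %NewReg:vgpr_32 = PHI %Reg, %bb.if, undef %UndefReg, %bb.then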
+
+  // Replace all uses in the ELSE region or in the PHIs of the ENDIF block.
+  for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) {
+    MachineOperand &O = *I;
+    // This is a little bit tricky: setReg() updates the use list, so we have
+    // to increment the iterator before calling setReg() to avoid skipping
+    // some uses.
+    ++I;
+    auto *UseMI = O.getParent();
+    auto *UseBlock = UseMI->getParent();
+    // Replace uses in the Endif block.
+    if (UseBlock == Endif) {
+      assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
+      O.setReg(NewReg);
+      continue;
+    }
+
+    // Replace uses in the ELSE region.
+    if (llvm::is_contained(ElseBlocks, UseBlock))
+      O.setReg(NewReg);
+  }
+
+  // The optimized Reg is not alive through the Flow block anymore.
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+  OldVarInfo.AliveBlocks.reset(Flow->getNumber());
+
+  updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks);
+  updateLiveRangeInThenRegion(Reg, If, Flow);
+}
+
+char SIOptimizeVGPRLiveRange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+                      "SI Optimize VGPR LiveRange", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+                    "SI Optimize VGPR LiveRange", false, false)
+
+char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID;
+
+FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() {
+  return new SIOptimizeVGPRLiveRange();
+}
+
+bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  Loops = &getAnalysis<MachineLoopInfo>();
+  LV = &getAnalysis<LiveVariables>();
+  MRI = &MF.getRegInfo();
+
+  bool MadeChange = false;
+
+  // TODO: we need to think about the order of visiting the blocks to get
+  // optimal results for nested if-else cases.
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto &MI : MBB.terminators()) {
+      // Detect the if-else blocks.
+      if (MI.getOpcode() == AMDGPU::SI_IF) {
+        MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+        auto *Endif = getElseTarget(IfTarget);
+        if (!Endif)
+          continue;
+
+        SmallVector<MachineBasicBlock *, 4> ElseBlocks;
+        SmallVector<Register, 4> CandidateRegs;
+
+        LLVM_DEBUG(dbgs() << "Checking IF-FLOW-ENDIF: bb." << MBB.getNumber()
+                          << " bb." << IfTarget->getNumber() << " bb."
+                          << Endif->getNumber() << "\n");
+
+        // Collect all the blocks in the ELSE region.
+        collectElseRegionBlocks(IfTarget, Endif, ElseBlocks);
+
+        // Collect the registers that can be optimized.
+        collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks,
+                                  CandidateRegs);
+        MadeChange |= !CandidateRegs.empty();
+        // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs) + optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } + } + } + + return MadeChange; +} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; a normal if-else +define amdgpu_ps float @else1(i32 %z, float %v) #0 { + ; SI-LABEL: name: else1 + ; SI: bb.0.main_body: + ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI: liveins: $vgpr0, $vgpr1 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 + ; SI: bb.1.Flow: + ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3 + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.2 + ; SI: bb.2.if: + ; SI: successors: %bb.4(0x80000000) + ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: bb.3.else: + ; SI: successors: %bb.1(0x80000000) + ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.1 + ; SI: bb.4.end: + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: $vgpr0 = COPY killed [[PHI2]] + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r = phi float [ %v.if, %if ], [ %v.else, %else ] + ret float %r +} + + +; %v was used after if-else +define amdgpu_ps float @else2(i32 %z, float %v) #0 { + ; SI-LABEL: name: else2 + ; SI: bb.0.main_body: + ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI: liveins: $vgpr0, $vgpr1 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 + ; SI: bb.1.Flow: + ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.2 + ; SI: bb.2.if: + ; SI: successors: %bb.4(0x80000000) + ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: 
bb.3.else: + ; SI: successors: %bb.1(0x80000000) + ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec + ; SI: S_BRANCH %bb.1 + ; SI: bb.4.end: + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2 + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: %14:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec + ; SI: $vgpr0 = COPY killed %14 + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +main_body: + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + br label %end + +else: + %v.else = fmul float %v, 3.0 + br label %end + +end: + %r0 = phi float [ %v.if, %if ], [ %v, %else ] + %r1 = phi float [ %v.if, %if ], [ %v.else, %else ] + %r2 = fadd float %r0, %r1 + ret float %r2 +} + +; if-else inside loop, %x can be optimized, but %v cannot be. +define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { + ; SI-LABEL: name: else3 + ; SI: bb.0.entry: + ; SI: successors: %bb.1(0x80000000) + ; SI: liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2 + ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 + ; SI: [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0 + ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec + ; SI: %1:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY2]], implicit $mode, implicit $exec + ; SI: %2:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY2]], [[COPY2]], implicit $mode, implicit $exec + ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; SI: bb.1.for.body: + ; SI: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; SI: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %13, %bb.5 + ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %12, %bb.5 + ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.4 + ; SI: bb.2.Flow: + ; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.1, %9, %bb.4 + ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %38:vgpr_32, %bb.4 + ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: S_BRANCH %bb.3 + ; SI: bb.3.if: + ; SI: successors: %bb.5(0x80000000) + ; SI: %8:vgpr_32, dead %31:sreg_64 = V_ADD_CO_U32_e64 1, killed [[PHI3]], 0, implicit $exec + ; SI: S_BRANCH %bb.5 + ; SI: bb.4.else: + ; SI: successors: %bb.2(0x80000000) + ; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec + ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]] + ; SI: S_BRANCH %bb.2 + ; SI: bb.5.if.end: + ; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI %1, %bb.2, %2, %bb.3 + ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, %8, %bb.3 + ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI: %12:vgpr_32, dead %33:sreg_64 = V_ADD_CO_U32_e64 1, [[PHI5]], 0, implicit $exec + ; SI: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc + ; SI: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc + ; 
SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; SI: S_BRANCH %bb.6 + ; SI: bb.6.for.end: + ; SI: %34:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI5]], killed [[PHI4]], implicit $mode, implicit $exec + ; SI: $vgpr0 = COPY killed %34 + ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 +entry: +; %break = icmp sgt i32 %bound, 0 +; br i1 %break, label %for.body, label %for.end + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ] + %cc = icmp sgt i32 %z, 5 + br i1 %cc, label %if, label %else + +if: + %v.if = fmul float %v, 2.0 + %x.if = add i32 %x, 1 + br label %if.end + +else: + %v.else = fmul float %v, 3.0 + %x.else = mul i32 %x, 3 + br label %if.end + +if.end: + %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ] + %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ] + + %xinc = add i32 %x.endif, 1 + %inc = add i32 %i, 1 + %cond = icmp slt i32 %inc, %bound + br i1 %cond, label %for.body, label %for.end + +for.end: + %x_float = bitcast i32 %x.endif to float + %r = fadd float %x_float, %v.endif + ret float %r +} + +attributes #0 = { nounwind }