diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1263,6 +1263,9 @@ [], [IntrNoMem]>; +// Like ps.live, but cannot be moved by LICM. +def int_amdgcn_wqm_helper : Intrinsic <[llvm_i1_ty], [], [IntrConvergent]>; + def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; @@ -1478,6 +1481,11 @@ // If false, set EXEC=0 for the current thread until the end of program. def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], + [llvm_i1_ty], [IntrConvergent] +>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. with all channels diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -239,6 +239,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -66,6 +66,8 @@ bool kill(MachineInstr &MI); + void demoteCleanup(MachineInstr &MI); + bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); public: @@ -342,6 +344,28 @@ } } +void SIInsertSkips::demoteCleanup(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B32), AMDGPU::EXEC_LO) + .addReg(AMDGPU::EXEC_LO) + .add(MI.getOperand(0)); + break; + + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); + break; + + default: + llvm_unreachable("invalid opcode, expected SI_DEMOTE_CLEANUP_*_TERMINATOR"); + } +} + // Returns true if a branch over the block was inserted. bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -417,6 +441,18 @@ break; } + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + assert(MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS); + demoteCleanup(MI); + if (dominatesAllReachable(MBB)) { + // As with kill we can null export if all lanes are demoted. + KillInstrs.push_back(&MI); + } else { + MI.eraseFromParent(); + } + break; + case AMDGPU::SI_RETURN_TO_EPILOG: // FIXME: Should move somewhere else assert(!MF.getInfo()->returnsVoid()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1554,6 +1554,18 @@ MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2107,15 +2119,19 @@ case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. return true; default: @@ -6392,6 +6408,8 @@ switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: return true; default: return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -237,6 +237,7 @@ def S_MOV_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +def S_AND_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -244,6 +245,7 @@ def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; +def S_AND_B32_term : WrapTerminatorInst; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -371,6 +373,31 @@ let SALU = 1; } +let Uses = [EXEC] in { +def SI_WQM_HELPER : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_wqm_helper))]> { + let SALU = 1; +} + +let Defs = [EXEC] in { +// Demote: Turn a pixel shader thread into a helper lane. +def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { +} +// Demote clean up terminators are added to indicate points where the execmask +// can be cleaned using the specified livemask. +// This allows the early termination of threads if their quad has become only +// helper lanes as a result of demotes. 
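+// SIInsertSkips expands these terminators to 's_and exec, exec, livemask' and,
+// when the terminator's block dominates all reachable blocks, may also emit an
+// early-exit null export once every lane in the wave has been demoted.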
+def SI_DEMOTE_CLEANUP_B32_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b32:$livemask)> { + let isTerminator = 1; +} +def SI_DEMOTE_CLEANUP_B64_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b64:$livemask)> { + let isTerminator = 1; +} +} // End Defs = [EXEC] + +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -745,6 +772,16 @@ (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 $src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 $src, -1) +>; + def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -215,6 +215,18 @@ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -68,11 +68,13 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -134,6 +136,9 @@ char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + unsigned LiveMaskIn = 0; // Initial live mask register + unsigned LiveMaskOut = 0; // Outgoing live mask register }; struct WorkItem { @@ -153,12 +158,22 @@ const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; DenseMap Instructions; - MapVector Blocks; - SmallVector LiveMaskQueries; + DenseMap Blocks; + + // Tracks live mask output of instructions + DenseMap LiveMaskRegs; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; + SmallVector DemoteInstrs; + SmallSet NeedsDemoteCleanup; void printInfo(); @@ -171,12 +186,16 @@ void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); + void scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist); + void analyzeLiveLanes(MachineFunction &MF); + MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before); MachineBasicBlock::iterator prepareInsertion(MachineBasicBlock &MBB, 
MachineBasicBlock::iterator First, MachineBasicBlock::iterator Last, bool PreferLast, - bool SaveSCC); + bool SaveSCC, bool CheckPhys); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -184,11 +203,28 @@ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + unsigned SavedOrig, char NonWWMState); + + bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI); + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); + void lowerBlock(MachineBasicBlock &MBB); + + unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before); + void processBlock(MachineBasicBlock &MBB, bool isEntry); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + bool lowerLiveMaskQueries(unsigned LiveMaskReg); void lowerCopyInstrs(); + bool lowerDemoteInstrs(); + + void lowerLiveMaskQuery(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskReg, bool isWQM); + MachineInstr *lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM); + MachineInstr *insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask); public: static char ID; @@ -201,10 +237,14 @@ StringRef getPassName() const override { return "SI Whole Quad Mode"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addPreserved(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -216,6 +256,8 @@ INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -323,6 +365,7 @@ for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = **BI; BlockInfo &BBI = Blocks[&MBB]; + bool HasDemoteInBlock = false; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; @@ -381,8 +424,39 @@ III.Disabled = StateWQM | StateWWM; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_WQM_HELPER) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_DEMOTE_I1) { + // Only perform a demote dominance test once per block + if (!HasDemoteInBlock) { + SmallVector ControlFlowInstrs; + bool DominatesAllReachable = true; + + // Simultaneously check if this demote is in control flow + // (dominates all blocks) and find all control flow ends + // which post dominate this block. 
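+              // SI_END_CF restores exec from a saved mask and would re-enable
+              // lanes demoted inside the region, so those rejoin points need a
+              // cleanup that re-applies the live mask.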
+ for (MachineBasicBlock *Other : depth_first(&MBB)) { + if (DominatesAllReachable && !MDT->dominates(&MBB, Other)) + DominatesAllReachable = false; + if (PDT->dominates(Other, &MBB)) { + auto FirstMI = Other->getFirstNonPHI(); + if ((FirstMI != Other->end()) && + (FirstMI->getOpcode() == AMDGPU::SI_END_CF)) { + ControlFlowInstrs.push_back(&*FirstMI); + } + } + } + + if (!DominatesAllReachable) { + // Demote is in control flow hence we must mark all control + // flow end instructions requiring clean up. + for (MachineInstr *CF : ControlFlowInstrs) + NeedsDemoteCleanup.insert(CF); + } + } + + DemoteInstrs.push_back(&MI); + HasDemoteInBlock = true; } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -524,6 +598,115 @@ return GlobalFlags; } +// Trace live mask manipulate through block, creating new virtual registers. +// Additionally insert PHI nodes when block has multiple predecessors +// which manipulated the mask. +void SIWholeQuadMode::scanLiveLanes( + MachineBasicBlock &MBB, std::vector &Worklist) { + BlockInfo &BI = Blocks[&MBB]; + + if (BI.LiveMaskIn && BI.LiveMaskOut) + return; // Block has been fully traced already. + + if (!BI.LiveMaskIn) { + // Find the incoming live mask, or insert PHI if there are multiple. + unsigned LastPredReg = 0; + unsigned Count = 0; + bool Valid = true; + + // Find predecessor live masks while performing basic deduplication. + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) { + Valid = false; + break; + } + if (PredBI.LiveMaskOut != LastPredReg) { + LastPredReg = PredBI.LiveMaskOut; + Count++; + } + } + + if (Valid) { + // All predecessors have live mask outputs. + if (Count > 1) { + BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstrBuilder PHI = + BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(TargetOpcode::PHI), + BI.LiveMaskIn); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + PHI.addReg(PredBI.LiveMaskOut); + PHI.addMBB(Pred); + } + LIS->InsertMachineInstrInMaps(*PHI); + } else { + BI.LiveMaskIn = LastPredReg; + } + } else { + // Not all predecessor blocks have live mask outputs, + // so this block will need to be revisited. + + if (!BI.LiveMaskOut) { + // Give this block a live mask output to ensure forward progress. + BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC()); + } + + // Queue this block to be revisited and visit predecessors. + Worklist.push_back(&MBB); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) + Worklist.push_back(Pred); + } + return; + } + } + + assert(BI.LiveMaskIn); + + // Now that the initial live mask register is known the block can + // be traced and intermediate live mask registers assigned for instructions + // which manipulate the mask. + unsigned CurrentLive = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC()); + LiveMaskRegs[&MI] = NewLive; + CurrentLive = NewLive; + } + II++; + } + + // If an output register was assigned to guarantee forward progress + // then it is possible the current live register will not become the output + // live mask register. This will be resolved during block lowering. 
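+  // (lowerBlock emits a COPY into the placeholder register in that case.)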
+ if (!BI.LiveMaskOut) { + BI.LiveMaskOut = CurrentLive; + } +} + +// Scan blocks for live mask manipulation operations in reverse post order +// to minimise rescans: a block will have to be rescanned if it's +// predecessors live mask output is not defined. +void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) { + std::vector Worklist; + + ReversePostOrderTraversal RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + scanLiveLanes(MBB, Worklist); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + scanLiveLanes(*MBB, Worklist); + } +} + MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { @@ -548,7 +731,8 @@ // instructions we want to add necessarily clobber SCC. MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock &MBB, MachineBasicBlock::iterator First, - MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC, + bool CheckPhys) { if (!SaveSCC) return PreferLast ? Last : First; @@ -581,9 +765,25 @@ MachineBasicBlock::iterator MBBI; - if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) { MBBI = MI; - else { + + if (CheckPhys) { + // Make sure insertion point is after any COPY instructions + // accessing physical live in registers. This is ensures that + // block splitting does not occur before all live ins have been copied. + while (MBBI != Last) { + if (MBBI->getOpcode() != AMDGPU::COPY) + break; + unsigned Register = MBBI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Register) && MBB.isLiveIn(Register)) { + MBBI++; + } else { + break; + } + } + } + } else { assert(Idx == LIS->getMBBEndIdx(&MBB)); MBBI = MBB.end(); } @@ -614,6 +814,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -633,6 +834,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -644,11 +846,12 @@ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + unsigned SavedOrig, char NonWWMState) { MachineInstr *MI; assert(SavedOrig); @@ -656,20 +859,285 @@ ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; +} + +void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, unsigned LiveMaskReg, + bool isWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dest = MI.getOperand(0).getReg(); + MachineInstr *Copy = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest).addReg(LiveMaskReg); + LIS->ReplaceMachineInstrInMaps(MI, *Copy); + MBB.remove(&MI); +} + +MachineInstr * +SIWholeQuadMode::insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask) { + const DebugLoc &DL = DebugLoc(); + const unsigned TermOp = ST->isWave32() + ? AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR + : AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR; + const unsigned WQMOp = ST->isWave32() ? 
AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; + unsigned LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + + MachineInstr *LiveMaskMI = + BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(WQMOp), LiveMaskWQM) + .addReg(LiveMask); + MachineInstr *NewTerm = BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(TermOp)) + .addReg(LiveMaskWQM); + + LIS->InsertMachineInstrInMaps(*LiveMaskMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + return NewTerm; +} + +// Lower an instruction which demotes lanes to helpers by adding +// appropriate live mask manipulation. Note this is also applied to kills. +MachineInstr *SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskIn, + unsigned LiveMaskOut, bool isWQM) { + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned AndN2 = + ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned And = ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *NewMI = nullptr; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + if (Imm == KillVal) { + NewMI = BuildMI(MBB, MI, DL, TII->get(AndN2), LiveMaskOut) + .addReg(LiveMaskIn) + .addReg(Exec); + } + } else { + unsigned Opcode = KillVal ? AndN2 : And; + NewMI = BuildMI(MBB, MI, DL, TII->get(Opcode), LiveMaskOut) + .addReg(LiveMaskIn) + .add(Op); + } + + if (NewMI) { + LIS->InsertMachineInstrInMaps(*NewMI); + } + + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + if (isWQM) { + // Inside WQM demotes are replaced with live mask manipulation + // and a terminator which is later lowered to remove unused helpers + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, &MI, nullptr, LiveMaskOut); + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + return NewTerm; + } else { + // Outside WQM demotes become kills terminating the block + MI.setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + return &MI; + } + } + + return nullptr; +} + +bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI) { + // Cannot split immediately before the epilog + // because there are values in physical registers + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + return false; + } + + return true; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + MachineBasicBlock::iterator SplitPoint(TermMI); + SplitPoint++; + + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " + << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = nullptr; + + // Only split the block if the split point is not + // already the end of the block. 
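+  // Splitting lets the exec mask update below be converted into a block
+  // terminator, which keeps spill code placement correct during register
+  // allocation (see S_AND_B32_term/S_AND_B64_term).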
+ if ((SplitPoint != BB->getFirstTerminator()) && (SplitPoint != BB->end())) { + MachineFunction *MF = BB->getParent(); + SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MachineFunction::iterator MBBI(BB); + ++MBBI; + MF->insert(MBBI, SplitBB); + + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(SplitBB); + + // Update dominator trees + using DomTreeT = DomTreeBase; + SmallVector DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, BB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB}); + if (MDT) + MDT->getBase().applyUpdates(DTUpdates); + if (PDT) + PDT->getBase().applyUpdates(DTUpdates); + + // Update live intervals + MachineInstr &InsertionPoint = SplitBB->front(); + LIS->insertMBBInMaps(SplitBB, &InsertionPoint); + } + + // Convert last instruction in to a terminator. + // Note: this only covers the expected patterns + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term)); + break; + case AMDGPU::S_AND_B64: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term)); + break; + default: + if (BB->getFirstTerminator() == BB->end()) { + assert(SplitBB != nullptr); + MachineInstr *MI = + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + LIS->InsertMachineInstrInMaps(*MI); + } + break; + } + + return SplitBB; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/WWM/Exact) of instructions is known. +void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); + const BlockInfo &BI = BII->second; + SmallVector SplitPoints; + unsigned LiveMaskReg = BI.LiveMaskIn; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) { + // Mark transitions to Exact mode as split points so they become + // block terminators. + if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) { + if (State != StateWWM && canSplitBlockAt(&MBB, &MI)) + SplitPoints.push_back(&MI); + } + State = StateTransition[&MI]; + } + + switch (MI.getOpcode()) { + case AMDGPU::SI_PS_LIVE: + case AMDGPU::SI_WQM_HELPER: + lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM); + break; + case AMDGPU::SI_DEMOTE_I1: { + MachineInstr *SplitPoint = lowerDemote( + MBB, MI, LiveMaskReg, LiveMaskRegs[&MI], State == StateWQM); + if (SplitPoint) + SplitPoints.push_back(SplitPoint); + break; + } + case AMDGPU::SI_END_CF: + if ((State == StateWQM) && NeedsDemoteCleanup.count(&MI)) { + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, nullptr, &Next, LiveMaskReg); + SplitPoints.push_back(NewTerm); + } + break; + default: + break; + } + + if (LiveMaskRegs.count(&MI)) + LiveMaskReg = LiveMaskRegs[&MI]; + + II = Next; + } + + if (BI.LiveMaskOut != LiveMaskReg) { + // If the final live mask register does not match the expected + // register of successor blocks then insert a copy. 
+ MachineBasicBlock::instr_iterator Terminator = + MBB.getFirstInstrTerminator(); + MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(), + TII->get(AMDGPU::COPY), BI.LiveMaskOut) + .addReg(LiveMaskReg); + LIS->InsertMachineInstrInMaps(*MI); + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + +unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before) { + assert(BI.LiveMaskIn); + if (BI.LiveMaskIn == BI.LiveMaskOut) + return BI.LiveMaskIn; + + // FIXME: make this more efficient than scanning all instructions in a block + unsigned LiveMaskReg = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + + while ((II != IE) && (II != Before)) { + MachineInstr *I = &*II; + if (LiveMaskRegs.count(I)) + LiveMaskReg = LiveMaskRegs[I]; + II++; + } + + assert(LiveMaskReg); + return LiveMaskReg; +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + BlockInfo &BI = BII->second; + // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); @@ -694,6 +1162,10 @@ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + + // Record initial state is block information. 
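+  // lowerBlock consumes this to seed the exec state when rewriting live mask
+  // users after the main state machine has run.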
+ BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -755,13 +1227,13 @@ First = FirstWQM; } - MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + MachineBasicBlock::iterator Before = prepareInsertion( + MBB, First, II, Needs == StateWQM, Needs == StateExact || WQMFromExec, + Needs == StateExact && isEntry); if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); State = NonWWMState; } @@ -775,7 +1247,7 @@ if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(BoolRC); - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before)); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -804,11 +1276,13 @@ if (II == IE) break; + II = Next; } } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + bool Changed = false; for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -818,7 +1292,20 @@ LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); + Changed = true; } + return Changed; +} + +bool SIWholeQuadMode::lowerDemoteInstrs() { + bool Changed = false; + for (MachineInstr *MI : DemoteInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + MI->setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + splitBlock(MBB, MI); + Changed = true; + } + return Changed; } void SIWholeQuadMode::lowerCopyInstrs() { @@ -865,6 +1352,10 @@ LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); + DemoteInstrs.clear(); + LiveMaskRegs.clear(); + StateTransition.clear(); + CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget(); @@ -873,39 +1364,55 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); + MDT = &getAnalysis(); + PDT = &getAnalysis(); + + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = !(DemoteInstrs.empty() && LiveMaskQueries.empty()); + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned LiveMaskReg = Exec; + + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty()) { + // Shader only needs Exact mode + const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg); + const bool LoweredDemotes = lowerDemoteInstrs(); + return LoweredQueries || LoweredDemotes; + } - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - lowerLiveMaskQueries(LiveMaskReg); + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + } - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) { + // Shader only needs WQM + BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; + lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); + return true; + } + + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + BlockInfo &BI = Blocks[&Entry]; + BI.LiveMaskIn = LiveMaskReg; + analyzeLiveLanes(MF); + } else { + for (auto BII : Blocks) { + BlockInfo &BI = Blocks[&*BII.first]; + BI.LiveMaskIn = LiveMaskReg; + BI.LiveMaskOut = LiveMaskReg; } } @@ -913,14 +1420,20 @@ lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + for (auto BII : Blocks) { + processBlock(*BII.first, BII.first == &Entry); + } - // Physical registers like SCC aren't tracked by default anyway, so just - // removing the ranges we computed is the simplest option for maintaining - // the analysis results. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + // Lowering blocks causes block splitting. + // Hence live ranges and slot indexes cease to be valid here. 
+ for (auto BII : Blocks) { + lowerBlock(*BII.first); + } + } else { + lowerLiveMaskQueries(LiveMaskReg); + lowerDemoteInstrs(); + } return true; } diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -49,6 +49,14 @@ ret void } +; CHECK: DIVERGENT: %tmp0 = call i1 @llvm.amdgcn.wqm.helper() +define amdgpu_kernel void @wqm_helper(i32 addrspace(1)* %out) #0 { + %tmp0 = call i1 @llvm.amdgcn.wqm.helper() + %tmp1 = select i1 %tmp0, i32 0, i32 1 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -56,6 +64,7 @@ declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 +declare i1 @llvm.amdgcn.wqm.helper() #0 attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,352 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s + +; GCN-LABEL: {{^}}static_exact: +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}dynamic_exact: +; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1 +; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1 +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]] +; GCN-64: s_and_b64 exec, exec, [[CND]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}branch: +; GCN-32: 
s_and_saveexec_b32 s1, s0 +; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-32: s_xor_b32 s0, exec_lo, s1 +; GCN-64: s_xor_b64 s[0:1], exec, s[2:3] +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN-32: s_or_b32 exec_lo, exec_lo, s0 +; GCN-64: s_or_b64 exec, exec, s[0:1] +; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @branch(float %arg0, float %arg1) { +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +; GCN-LABEL: {{^}}wqm_demote_1: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: image_sample +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_2: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], 
[[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_dynamic: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: v_cmp_gt_f32_e32 vcc +; GCN-32-NEXT: s_and_b32 [[LIVE:s[0-9]+]], [[ORIG]], vcc +; GCN-64-NEXT: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: s_endpgm +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +; GCN-LABEL: {{^}}wqm_deriv: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue0 +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: 
s_cbranch_execnz +; GCN: exp null +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, [[LIVE]] +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32-NEXT: s_mov_b32 exec_lo, 0 +; GCN-64-NEXT: s_mov_b64 exec, 0 +; GCN: ; %.continue1 +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.wqm.helper() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +; GCN-LABEL: {{^}}wqm_deriv_loop: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue0.preheader +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: ; %.demote1 +; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec +; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue1 +; GCN-32: s_or_b32 exec_lo +; GCN-64: s_or_b64 exec +; GCN: ; %.continue0 +; PRE-GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]] +; GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:s[0-9]+]], 0, [[LIVE]] +; GCN: ; %.return +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float 
@llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.wqm.helper() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.wqm.helper() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone }
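For reference, and not part of the patch itself, the sketch below shows how a front end implementing a demote-to-helper-invocation style operation might be expected to use the two new intrinsics; the function and value names are purely illustrative. Lanes failing the test are demoted rather than killed, so the implicit-derivative sample that follows still sees complete quads. This mirrors the wqm_demote_dynamic and wqm_deriv tests above.

define amdgpu_ps <4 x float> @demote_then_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, float %alpha) {
.entry:
  ; Demote (do not kill) lanes that fail the alpha test; they remain in exec as
  ; helper lanes while in WQM.
  %keep = fcmp ogt float %alpha, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 %keep)
  ; Implicit-derivative sample still has whole quads available.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
  ; Report whether this lane is still live (like llvm.amdgcn.ps.live, but not
  ; hoistable by LICM).
  %live = call i1 @llvm.amdgcn.wqm.helper()
  %res = select i1 %live, <4 x float> %tex, <4 x float> zeroinitializer
  ret <4 x float> %res
}

declare void @llvm.amdgcn.wqm.demote(i1)
declare i1 @llvm.amdgcn.wqm.helper()
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)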