diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1320,6 +1320,12 @@ [], [IntrNoMem, IntrWillReturn]>; +// Like ps.live, but cannot be moved by LICM +// (i.e. this returns true if the lane is not a helper lane). +def int_amdgcn_wqm_live : Intrinsic <[llvm_i1_ty], + [], [IntrReadMem, IntrInaccessibleMemOnly] +>; + def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], @@ -1545,6 +1551,11 @@ // FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; +// If false, mark all active lanes as helper lanes until the end of the program. +def int_amdgcn_wqm_demote : Intrinsic<[], + [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly] +>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. with all channels diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4302,6 +4302,11 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); break; } + case Intrinsic::amdgcn_wqm_live: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_wqm_demote: case Intrinsic::amdgcn_kill: { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,6 +243,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence<int_amdgcn_wqm_live>; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -69,6 +69,8 @@ bool kill(MachineInstr &MI); + void demoteCleanup(MachineInstr &MI); + bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); public: @@ -382,6 +384,28 @@ } } +void SIInsertSkips::demoteCleanup(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B32), AMDGPU::EXEC_LO) + .addReg(AMDGPU::EXEC_LO) + .add(MI.getOperand(0)); + break; + + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); + break; + + default: + llvm_unreachable("invalid opcode, expected SI_DEMOTE_CLEANUP_*_TERMINATOR"); + } +} + // Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -456,6 +480,18 @@ break; } + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + assert(MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS); + demoteCleanup(MI); + if (dominatesAllReachable(MBB)) { + // As with kill, we can null export if all lanes are demoted. + KillInstrs.push_back(&MI); + } else { + MI.eraseFromParent(); + } + break; + case AMDGPU::SI_KILL_CLEANUP: if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && dominatesAllReachable(MBB)) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1669,6 +1669,18 @@ MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2228,15 +2240,19 @@ case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. return true; default: @@ -6804,6 +6820,8 @@ switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: return true; default: return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -265,6 +265,7 @@ def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; +def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>; } let WaveSizePredicate = isWave32 in { @@ -272,6 +273,7 @@ def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; +def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>; } @@ -403,6 +405,31 @@ let SALU = 1; } +let Uses = [EXEC] in { +def SI_WQM_LIVE : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_wqm_live))]> { + let SALU = 1; +} + +let Defs = [EXEC] in { +// Demote: Turn a pixel shader thread into a helper lane. +def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { +} +// Demote cleanup terminators are added to indicate points where the exec mask +// can be cleaned using the specified live mask. +// This allows early termination of threads whose quad has become only +// helper lanes as a result of demotes.
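+// For illustration: SIInsertSkips::demoteCleanup later expands these into a +// direct exec update, e.g. SI_DEMOTE_CLEANUP_B64_TERMINATOR $livemask becomes +// "s_and_b64 exec, exec, $livemask" (s_and_b32 with exec_lo for wave32).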
+def SI_DEMOTE_CLEANUP_B32_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b32:$livemask)> { + let isTerminator = 1; +} +def SI_DEMOTE_CLEANUP_B64_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b64:$livemask)> { + let isTerminator = 1; +} +} // End Defs = [EXEC] + +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -754,6 +781,16 @@ (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 SCSrc_i1:$src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 SCSrc_i1:$src, -1) +>; + def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -214,6 +214,18 @@ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -68,11 +68,13 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -134,6 +136,9 @@ char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + unsigned LiveMaskIn = 0; // Initial live mask register + unsigned LiveMaskOut = 0; // Outgoing live mask register }; struct WorkItem { @@ -153,12 +158,22 @@ const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; DenseMap Instructions; - MapVector Blocks; - SmallVector LiveMaskQueries; + DenseMap Blocks; + + // Tracks live mask output of instructions + DenseMap LiveMaskRegs; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; + SmallVector DemoteInstrs; + SmallSet NeedsDemoteCleanup; void printInfo(); @@ -171,12 +186,16 @@ void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); + void scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist); + void analyzeLiveLanes(MachineFunction &MF); + MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before); MachineBasicBlock::iterator 
prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, MachineBasicBlock::iterator Last, bool PreferLast, - bool SaveSCC); + bool SaveSCC, bool CheckPhys); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -184,11 +203,28 @@ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + unsigned SavedOrig, char NonWWMState); + + bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI); + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); + void lowerBlock(MachineBasicBlock &MBB); + + unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before); + void processBlock(MachineBasicBlock &MBB, bool isEntry); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + bool lowerLiveMaskQueries(unsigned LiveMaskReg); void lowerCopyInstrs(); + bool lowerDemoteInstrs(); + + void lowerLiveMaskQuery(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskReg, bool isWQM); + MachineInstr *lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM); + MachineInstr *insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask); public: static char ID; @@ -201,10 +237,14 @@ StringRef getPassName() const override { return "SI Whole Quad Mode"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addPreserved(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -216,6 +256,8 @@ INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -323,6 +365,7 @@ for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = **BI; BlockInfo &BBI = Blocks[&MBB]; + bool HasDemoteInBlock = false; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; @@ -381,8 +424,39 @@ III.Disabled = StateWQM | StateWWM; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_WQM_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_DEMOTE_I1) { + // Only perform a demote dominance test once per block. + if (!HasDemoteInBlock) { + SmallVector<MachineInstr *, 4> ControlFlowInstrs; + bool DominatesAllReachable = true; + + // Simultaneously check whether this demote is inside control flow + // (i.e. does not dominate all reachable blocks) and find all + // control flow ends which post-dominate this block.
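+ // (e.g. a demote in one arm of an if does not dominate the blocks + // reachable from the merge point, so the SI_END_CF at the merge is + // marked below for exec cleanup).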
+ for (MachineBasicBlock *Other : depth_first(&MBB)) { + if (DominatesAllReachable && !MDT->dominates(&MBB, Other)) + DominatesAllReachable = false; + if (PDT->dominates(Other, &MBB)) { + auto FirstMI = Other->getFirstNonPHI(); + if ((FirstMI != Other->end()) && + (FirstMI->getOpcode() == AMDGPU::SI_END_CF)) { + ControlFlowInstrs.push_back(&*FirstMI); + } + } + } + + if (!DominatesAllReachable) { + // Demote is inside control flow, hence we must mark all control + // flow end instructions as requiring cleanup. + for (MachineInstr *CF : ControlFlowInstrs) + NeedsDemoteCleanup.insert(CF); + } + } + + DemoteInstrs.push_back(&MI); + HasDemoteInBlock = true; } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -524,6 +598,115 @@ return GlobalFlags; } +// Trace live mask manipulation through the block, creating new virtual +// registers. Additionally insert PHI nodes when a block has multiple +// predecessors which manipulated the mask. +void SIWholeQuadMode::scanLiveLanes( + MachineBasicBlock &MBB, std::vector<MachineBasicBlock *> &Worklist) { + BlockInfo &BI = Blocks[&MBB]; + + if (BI.LiveMaskIn && BI.LiveMaskOut) + return; // Block has been fully traced already. + + if (!BI.LiveMaskIn) { + // Find the incoming live mask, or insert a PHI if there are multiple. + unsigned LastPredReg = 0; + unsigned Count = 0; + bool Valid = true; + + // Find predecessor live masks while performing basic deduplication. + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) { + Valid = false; + break; + } + if (PredBI.LiveMaskOut != LastPredReg) { + LastPredReg = PredBI.LiveMaskOut; + Count++; + } + } + + if (Valid) { + // All predecessors have live mask outputs. + if (Count > 1) { + BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstrBuilder PHI = + BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(TargetOpcode::PHI), + BI.LiveMaskIn); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + PHI.addReg(PredBI.LiveMaskOut); + PHI.addMBB(Pred); + } + LIS->InsertMachineInstrInMaps(*PHI); + } else { + BI.LiveMaskIn = LastPredReg; + } + } else { + // Not all predecessor blocks have live mask outputs, + // so this block will need to be revisited. + + if (!BI.LiveMaskOut) { + // Give this block a live mask output to ensure forward progress. + BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC()); + } + + // Queue this block to be revisited and visit predecessors. + Worklist.push_back(&MBB); + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if (!PredBI.LiveMaskOut) + Worklist.push_back(Pred); + } + return; + } + } + + assert(BI.LiveMaskIn); + + // Now that the initial live mask register is known, the block can + // be traced and intermediate live mask registers assigned for instructions + // which manipulate the mask. + unsigned CurrentLive = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC()); + LiveMaskRegs[&MI] = NewLive; + CurrentLive = NewLive; + } + II++; + } + + // If an output register was assigned to guarantee forward progress + // then it is possible the current live register will not become the output + // live mask register. This will be resolved during block lowering.
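+ // (lowerBlock inserts a COPY to BI.LiveMaskOut when the register traced + // here differs from it).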
+ if (!BI.LiveMaskOut) { + BI.LiveMaskOut = CurrentLive; + } +} + +// Scan blocks for live mask manipulation operations in reverse post order +// to minimise rescans: a block will have to be rescanned if its +// predecessors' live mask outputs are not defined. +void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) { + std::vector<MachineBasicBlock *> Worklist; + + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + scanLiveLanes(MBB, Worklist); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + scanLiveLanes(*MBB, Worklist); + } +} + MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { @@ -548,7 +731,8 @@ // instructions we want to add necessarily clobber SCC. MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock &MBB, MachineBasicBlock::iterator First, - MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC, + bool CheckPhys) { if (!SaveSCC) return PreferLast ? Last : First; @@ -581,9 +765,25 @@ MachineBasicBlock::iterator MBBI; - if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) { MBBI = MI; - else { + + if (CheckPhys) { + // Make sure the insertion point is after any COPY instructions + // accessing physical live-in registers. This ensures that + // block splitting does not occur before all live-ins have been copied. + while (MBBI != Last) { + if (MBBI->getOpcode() != AMDGPU::COPY) + break; + unsigned Reg = MBBI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Reg) && MBB.isLiveIn(Reg)) { + MBBI++; + } else { + break; + } + } + } + } else { assert(Idx == LIS->getMBBEndIdx(&MBB)); MBBI = MBB.end(); } @@ -614,6 +814,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -633,6 +834,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -644,11 +846,12 @@ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + unsigned SavedOrig, char NonWWMState) { MachineInstr *MI; assert(SavedOrig); @@ -656,20 +859,285 @@ ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; +} + +void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, unsigned LiveMaskReg, + bool isWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dest = MI.getOperand(0).getReg(); + MachineInstr *Copy = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest).addReg(LiveMaskReg); + LIS->ReplaceMachineInstrInMaps(MI, *Copy); + MBB.remove(&MI); +} + +MachineInstr * +SIWholeQuadMode::insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask) { + const DebugLoc &DL = DebugLoc(); + const unsigned TermOp = ST->isWave32() + ? AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR + : AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR; + const unsigned WQMOp = ST->isWave32() ?
AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; + unsigned LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + + MachineInstr *LiveMaskMI = + BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(WQMOp), LiveMaskWQM) + .addReg(LiveMask); + MachineInstr *NewTerm = BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(TermOp)) + .addReg(LiveMaskWQM); + + LIS->InsertMachineInstrInMaps(*LiveMaskMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + return NewTerm; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +// Lower an instruction which demotes lanes to helpers by adding +// appropriate live mask manipulation. Note this is also applied to kills. +MachineInstr *SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskIn, + unsigned LiveMaskOut, bool isWQM) { + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned AndN2 = + ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned And = ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *NewMI = nullptr; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + if (Imm == KillVal) { + NewMI = BuildMI(MBB, MI, DL, TII->get(AndN2), LiveMaskOut) + .addReg(LiveMaskIn) + .addReg(Exec); + } + } else { + unsigned Opcode = KillVal ? AndN2 : And; + NewMI = BuildMI(MBB, MI, DL, TII->get(Opcode), LiveMaskOut) + .addReg(LiveMaskIn) + .add(Op); + } + + if (NewMI) { + LIS->InsertMachineInstrInMaps(*NewMI); + } + + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + if (isWQM) { + // Inside WQM demotes are replaced with live mask manipulation + // and a terminator which is later lowered to remove unused helpers + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, &MI, nullptr, LiveMaskOut); + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + return NewTerm; + } else { + // Outside WQM demotes become kills terminating the block + MI.setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + return &MI; + } + } + + return nullptr; +} + +bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI) { + // Cannot split immediately before the epilog + // because there are values in physical registers + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + return false; + } + + return true; +} + +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + MachineBasicBlock::iterator SplitPoint(TermMI); + SplitPoint++; + + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " + << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = nullptr; + + // Only split the block if the split point is not + // already the end of the block. 
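+ // (if it already is, no new block is required and only the conversion + // of TermMI into a terminator below is performed).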
+ if ((SplitPoint != BB->getFirstTerminator()) && (SplitPoint != BB->end())) { + MachineFunction *MF = BB->getParent(); + SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MachineFunction::iterator MBBI(BB); + ++MBBI; + MF->insert(MBBI, SplitBB); + + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(SplitBB); + + // Update dominator trees + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 4> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, BB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB}); + if (MDT) + MDT->getBase().applyUpdates(DTUpdates); + if (PDT) + PDT->getBase().applyUpdates(DTUpdates); + + // Update live intervals + MachineInstr &InsertionPoint = SplitBB->front(); + LIS->insertMBBInMaps(SplitBB, &InsertionPoint); + } + + // Convert the last instruction into a terminator. + // Note: this only covers the expected patterns. + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term)); + break; + case AMDGPU::S_AND_B64: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term)); + break; + default: + if (BB->getFirstTerminator() == BB->end()) { + assert(SplitBB != nullptr); + MachineInstr *MI = + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + LIS->InsertMachineInstrInMaps(*MI); + } + break; + } + + return SplitBB; +} + +// Replace (or supplement) instructions accessing the live mask. +// This can only happen once all live mask registers have been created +// and the execution state (WQM/WWM/Exact) of each instruction is known. +void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); + const BlockInfo &BI = BII->second; + SmallVector<MachineInstr *, 4> SplitPoints; + unsigned LiveMaskReg = BI.LiveMaskIn; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) { + // Mark transitions to Exact mode as split points so they become + // block terminators. + if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) { + if (State != StateWWM && canSplitBlockAt(&MBB, &MI)) + SplitPoints.push_back(&MI); + } + State = StateTransition[&MI]; + } + + switch (MI.getOpcode()) { + case AMDGPU::SI_PS_LIVE: + case AMDGPU::SI_WQM_LIVE: + lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM); + break; + case AMDGPU::SI_DEMOTE_I1: { + MachineInstr *SplitPoint = lowerDemote( + MBB, MI, LiveMaskReg, LiveMaskRegs[&MI], State == StateWQM); + if (SplitPoint) + SplitPoints.push_back(SplitPoint); + break; + } + case AMDGPU::SI_END_CF: + if ((State == StateWQM) && NeedsDemoteCleanup.count(&MI)) { + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, nullptr, &Next, LiveMaskReg); + SplitPoints.push_back(NewTerm); + } + break; + default: + break; + } + + if (LiveMaskRegs.count(&MI)) + LiveMaskReg = LiveMaskRegs[&MI]; + + II = Next; + } + + if (BI.LiveMaskOut != LiveMaskReg) { + // If the final live mask register does not match the expected + // register of successor blocks then insert a copy.
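+ // (this reconciles a forward-progress output register pre-created by + // scanLiveLanes with the register actually traced through this block).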
+ MachineBasicBlock::instr_iterator Terminator = + MBB.getFirstInstrTerminator(); + MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(), + TII->get(AMDGPU::COPY), BI.LiveMaskOut) + .addReg(LiveMaskReg); + LIS->InsertMachineInstrInMaps(*MI); + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + +unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before) { + assert(BI.LiveMaskIn); + if (BI.LiveMaskIn == BI.LiveMaskOut) + return BI.LiveMaskIn; + + // FIXME: make this more efficient than scanning all instructions in a block + unsigned LiveMaskReg = BI.LiveMaskIn; + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + + while ((II != IE) && (II != Before)) { + MachineInstr *I = &*II; + if (LiveMaskRegs.count(I)) + LiveMaskReg = LiveMaskRegs[I]; + II++; + } + + assert(LiveMaskReg); + return LiveMaskReg; +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + BlockInfo &BI = BII->second; + // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); @@ -694,6 +1162,10 @@ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + + // Record the initial state in the block information.
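+ // (lowerBlock later replays the block's instructions starting from this + // recorded state).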
+ BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -755,13 +1227,13 @@ First = FirstWQM; } - MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + MachineBasicBlock::iterator Before = prepareInsertion( + MBB, First, II, Needs == StateWQM, Needs == StateExact || WQMFromExec, + Needs == StateExact && isEntry); if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); SavedNonWWMReg = 0; State = NonWWMState; @@ -780,7 +1252,7 @@ SavedWQMReg = MRI->createVirtualRegister(BoolRC); } - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before)); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -809,13 +1281,15 @@ if (II == IE) break; + II = Next; } assert(!SavedWQMReg); assert(!SavedNonWWMReg); } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + bool Changed = false; for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -825,7 +1299,20 @@ LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +bool SIWholeQuadMode::lowerDemoteInstrs() { + bool Changed = false; + for (MachineInstr *MI : DemoteInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + MI->setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + splitBlock(MBB, MI); + Changed = true; } + return Changed; } void SIWholeQuadMode::lowerCopyInstrs() { @@ -872,6 +1359,10 @@ LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); + DemoteInstrs.clear(); + LiveMaskRegs.clear(); + StateTransition.clear(); + CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget<GCNSubtarget>(); @@ -880,41 +1371,56 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); + MDT = &getAnalysis<MachineDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTree>(); + + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = !(DemoteInstrs.empty() && LiveMaskQueries.empty()); + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned LiveMaskReg = Exec; + + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty()) { + // Shader only needs Exact mode. + const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg); + const bool LoweredDemotes = lowerDemoteInstrs(); + return LoweredQueries || LoweredDemotes; + } - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - lowerLiveMaskQueries(LiveMaskReg); + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + } - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - auto MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 - : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); + if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) { + // Shader only needs WQM + auto MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; + lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); + return true; + } + + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + BlockInfo &BI = Blocks[&Entry]; + BI.LiveMaskIn = LiveMaskReg; + analyzeLiveLanes(MF); + } else { + for (auto BII : Blocks) { + BlockInfo &BI = Blocks[&*BII.first]; + BI.LiveMaskIn = LiveMaskReg; + BI.LiveMaskOut = LiveMaskReg; } } @@ -922,12 +1428,20 @@ lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + for (auto BII : Blocks) { + processBlock(*BII.first, BII.first == &Entry); + } - if (LiveMaskReg) - LIS->createAndComputeVirtRegInterval(LiveMaskReg); + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + // Lowering blocks causes block splitting. + // Hence live ranges and slot indexes cease to be valid here. 
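+ // (Blocks itself is keyed by the original basic blocks, which splitBlock + // leaves in place, so iterating it while lowering remains valid.)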
+ for (auto BII : Blocks) { + lowerBlock(*BII.first); + } + } else { + lowerLiveMaskQueries(LiveMaskReg); + lowerDemoteInstrs(); + } // Physical registers like SCC aren't tracked by default anyway, so just // removing the ranges we computed is the simplest option for maintaining diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -49,6 +49,14 @@ ret void } +; CHECK: DIVERGENT: %tmp0 = call i1 @llvm.amdgcn.wqm.live() +define amdgpu_kernel void @wqm.live(i32 addrspace(1)* %out) #0 { + %tmp0 = call i1 @llvm.amdgcn.wqm.live() + %tmp1 = select i1 %tmp0, i32 0, i32 1 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -56,6 +64,8 @@ declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 +declare i1 @llvm.amdgcn.wqm.live() #2 attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } +attributes #2 = { nounwind readonly inaccessiblememonly } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1275 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s0, 0, 0 +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB0_2 
+; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void 
@branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; SI-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB2_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: BB2_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB2_4 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB2_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: BB2_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB2_4 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s0, 0, 1 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s0 +; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB2_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: BB2_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB2_4 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB2_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; 
GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: BB2_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB2_4 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; SI-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; SI-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; SI-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; SI-NEXT: s_cbranch_execz BB3_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: BB3_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_5 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_5 +; SI-NEXT: ; %bb.4: ; %.continue +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_6 +; SI-NEXT: BB3_5: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_6: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX9-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: BB3_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_5 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_5 +; GFX9-NEXT: ; %bb.4: ; %.continue +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], 
s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_6 +; GFX9-NEXT: BB3_5: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_6: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s13, 0, 1 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, s13 +; GFX10-32-NEXT: s_and_saveexec_b32 s14, s13 +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s14 +; GFX10-32-NEXT: s_cbranch_execz BB3_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s14, 0, 0 +; GFX10-32-NEXT: s_and_b32 s12, s12, s14 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-32-NEXT: BB3_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_5 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_5 +; GFX10-32-NEXT: ; %bb.4: ; %.continue +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_6 +; GFX10-32-NEXT: BB3_5: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_6: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX10-64-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX10-64-NEXT: s_cbranch_execz BB3_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: BB3_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_5 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_5 +; GFX10-64-NEXT: ; %bb.4: ; %.continue +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_6 +; GFX10-64-NEXT: BB3_5: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_6: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> 
%sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; SI-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; SI-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; SI-NEXT: s_cbranch_execz BB4_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: BB4_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_5 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_5 +; SI-NEXT: ; %bb.4: ; %.continue +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_6 +; SI-NEXT: BB4_5: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_6: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX9-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX9-NEXT: s_cbranch_execz BB4_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: BB4_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_5 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_5 +; GFX9-NEXT: ; %bb.4: ; %.continue +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_6 +; GFX9-NEXT: BB4_5: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_6: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s13, 0, 1 +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, s13 +; GFX10-32-NEXT: s_and_saveexec_b32 s14, s13 +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s14 +; GFX10-32-NEXT: s_cbranch_execz BB4_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s14, 0, 0 +; GFX10-32-NEXT: s_and_b32 s12, s12, s14 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-32-NEXT: BB4_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_5 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_5 +; GFX10-32-NEXT: ; %bb.4: ; %.continue +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_6 +; GFX10-32-NEXT: BB4_5: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_6: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX10-64-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX10-64-NEXT: s_cbranch_execz BB4_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: BB4_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_5 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_5 +; GFX10-64-NEXT: ; %bb.4: ; %.continue +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_6 +; GFX10-64-NEXT: BB4_5: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_6: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; 
SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_b32 s12, s12, vcc_lo +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, 
i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s0, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s1, s0, 0x100000 +; SI-NEXT: s_lshl_b32 s0, s4, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s1, s4, s1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_cbranch_execz BB6_2 +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] +; SI-NEXT: s_wqm_b64 s[6:7], s[2:3] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: BB6_2: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_wqm_b64 s[4:5], s[2:3] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.4: ; %.continue0 +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_6 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: BB6_6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.7: ; %.continue1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_8: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: BB6_2: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_7 +; 
GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_5 +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: BB6_5: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB6_7 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s2, 0, 1 +; GFX10-32-NEXT: s_movk_i32 s1, 0x3c00 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, s2 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB6_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_and_b32 s0, s0, s3 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: BB6_2: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_7 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, s1, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s1, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 1 +; GFX10-32-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s0, s0, s3 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s0 +; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB6_5 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: BB6_5: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB6_7 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; 
%bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX10-64-NEXT: s_cbranch_execz BB6_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: BB6_2: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_7 +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_5 +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: BB6_5: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB6_7 +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.wqm.live() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + 
ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; SI-NEXT: s_cbranch_execz BB7_2 +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: BB7_2: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_9 +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: BB7_4: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1] +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; SI-NEXT: s_cbranch_execz BB7_6 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: BB7_6: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.7: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; SI-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_and_b64 s[6:7], exec, s[6:7] +; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execnz BB7_4 +; SI-NEXT: ; %bb.8: ; %.return +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; 
GFX9-NEXT: s_cbranch_execz BB7_2 +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: BB7_2: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_9 +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: BB7_4: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_6 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: BB7_6: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.7: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_and_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz BB7_4 +; GFX9-NEXT: ; %bb.8: ; %.return +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 1 +; GFX10-32-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s3, vcc_lo, s3 +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: s_cbranch_execz BB7_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0 +; GFX10-32-NEXT: s_and_b32 s0, s0, s4 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: BB7_2: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB7_9 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: 
s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-32-NEXT: BB7_4: ; %.continue0 +; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1 +; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-32-NEXT: s_and_b32 s5, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s4, s5, s4 +; GFX10-32-NEXT: s_and_saveexec_b32 s5, s4 +; GFX10-32-NEXT: s_xor_b32 s4, exec_lo, s5 +; GFX10-32-NEXT: s_cbranch_execz BB7_6 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 0 +; GFX10-32-NEXT: s_and_b32 s0, s0, s5 +; GFX10-32-NEXT: s_wqm_b32 s5, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s5 +; GFX10-32-NEXT: BB7_6: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: ; %bb.7: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1 +; GFX10-32-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v1 +; GFX10-32-NEXT: s_xor_b32 s4, vcc_lo, s4 +; GFX10-32-NEXT: s_and_b32 s4, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 s1, s4, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execnz BB7_4 +; GFX10-32-NEXT: ; %bb.8: ; %.return +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: s_mov_b32 s3, 0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX10-64-NEXT: s_cbranch_execz BB7_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: BB7_2: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_9 +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: BB7_4: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1] +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf 
bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX10-64-NEXT: s_and_b64 s[8:9], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] +; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX10-64-NEXT: s_cbranch_execz BB7_6 +; GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: BB7_6: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.7: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1 +; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX10-64-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 +; GFX10-64-NEXT: s_xor_b64 s[6:7], vcc, s[6:7] +; GFX10-64-NEXT: s_and_b64 s[6:7], exec, s[6:7] +; GFX10-64-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execnz BB7_4 +; GFX10-64-NEXT: ; %bb.8: ; %.return +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB7_9: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.wqm.live() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.wqm.live() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s + +--- +name: wqm_demote_scc +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: wqm_demote_scc + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... + +--- +name: wqm_demote_vcc +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: wqm_demote_vcc + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... + +--- +name: wqm_demote_constant_true +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_true + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 true + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... + +--- +name: wqm_demote_constant_false +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_false + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 false + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.live.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.live.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.live.mir @@ -0,0 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: wqm_live +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_live + ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.live) + ; CHECK: S_ENDPGM 0, implicit [[INT]](s1) + %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.live) + S_ENDPGM 0, implicit %0 +... + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,363 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s + +; GCN-LABEL: {{^}}static_exact: +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}dynamic_exact: +; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1 +; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1 +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]] +; GCN-64: s_and_b64 exec, exec, [[CND]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}branch: +; GCN-32: s_and_saveexec_b32 s1, s0 +; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-32: s_xor_b32 s0, exec_lo, s1 +; GCN-64: s_xor_b64 s[0:1], exec, s[2:3] +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN-32: s_or_b32 exec_lo, exec_lo, s0 +; GCN-64: 
s_or_b64 exec, exec, s[0:1] +; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @branch(float %arg0, float %arg1) { +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +; GCN-LABEL: {{^}}wqm_demote_1: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]*]] +; GCN: image_sample +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +; GCN: [[EXITBB]]: +; GCN: exp null +; GCN: s_endpgm +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_2: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]*]] +; GCN: v_add_f32_e32 +; GCN-32: 
s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +; GCN: [[EXITBB]]: +; GCN: exp null +; GCN: s_endpgm +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_dynamic: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: v_cmp_gt_f32_e32 vcc +; GCN-32-NEXT: s_and_b32 [[LIVE:s[0-9]+]], [[ORIG]], vcc +; GCN-64-NEXT: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]*]] +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +; GCN: [[EXITBB]]: +; GCN: exp null +; GCN: s_endpgm +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +; GCN-LABEL: {{^}}wqm_deriv: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execz +; GCN: ; %.continue0 +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]*]] +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, 
[[LIVE]] +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32-NEXT: s_mov_b32 exec_lo, 0 +; GCN-64-NEXT: s_mov_b64 exec, 0 +; GCN: ; %.continue1 +; GCN: exp mrt0 +; GCN: [[EXITBB]]: +; GCN: exp null +; GCN: s_endpgm +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.wqm.live() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +; GCN-LABEL: {{^}}wqm_deriv_loop: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execz +; GCN: ; %.continue0.preheader +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]*]] +; GCN: ; %.demote1 +; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec +; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec +; GCN-NOT: s_cbranch_execz +; GCN: ; %.continue1 +; GCN-32: s_or_b32 exec_lo +; GCN-64: s_or_b64 exec +; GCN: ; %.continue0 +; PRE-GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]] +; GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:s[0-9]+]], 0, [[LIVE]] +; GCN: ; %.return +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: exp mrt0 +; GCN: [[EXITBB]]: +; GCN: exp null +; GCN: s_endpgm +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) 
#2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.wqm.live() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 +declare void @llvm.amdgcn.wqm.demote(i1) #5 +declare i1 @llvm.amdgcn.wqm.live() #6 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } +attributes #5 = { writeonly inaccessiblememonly nounwind } +attributes #6 = { readonly inaccessiblememonly nounwind }
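For reference, a minimal usage sketch of how the two new intrinsics compose in a pixel shader (not part of the patch; the function name and exported values are illustrative). It assumes the semantics stated in the intrinsic definitions: demote turns lanes whose argument is false into helper lanes, which keep executing so quad derivatives stay valid but no longer export, and wqm.live then reports whether a lane is still a non-helper.

; Hypothetical example, not from the patch: demote lanes where %x >= 0,
; then export 1.0 only from lanes that survived as non-helpers.
define amdgpu_ps void @demote_example(float %x) {
.entry:
  %keep = fcmp olt float %x, 0.000000e+00
  ; Lanes where %keep is false become helper lanes from this point on;
  ; they keep running so derivatives within the quad remain well defined.
  call void @llvm.amdgcn.wqm.demote(i1 %keep)
  ; True only for lanes that have not been demoted (i.e. not helpers).
  %live = call i1 @llvm.amdgcn.wqm.live()
  %out = select i1 %live, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %out, float %out, float %out, float %out, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.wqm.demote(i1)
declare i1 @llvm.amdgcn.wqm.live()
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)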