diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1354,6 +1354,12 @@
   [], [IntrNoMem, IntrWillReturn]>;
 
+// Similar to int_amdgcn_ps_live, but cannot be moved by LICM.
+// Returns true if the lane is not a helper lane.
+def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
+  [], [IntrReadMem, IntrInaccessibleMemOnly]
+>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
@@ -1583,6 +1589,11 @@
   Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
 >;
 
+// If false, mark all active lanes as helper lanes until the end of the program.
+def int_amdgcn_wqm_demote : Intrinsic<[],
+  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
+>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
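A minimal LLVM IR sketch (hypothetical shader, not part of the patch) of how the
two intrinsics defined above are intended to be used together: the demote turns
failing lanes into helper lanes, and live.mask observes lane liveness afterwards,
which is why it is modelled as an inaccessible-memory read and must not be
hoisted past the demote by LICM:

  define amdgpu_ps float @demote_then_query(float %arg) {
  .entry:
    %keep = fcmp olt float %arg, 0.000000e+00
    ; Lanes where %keep is false become helper lanes from here on.
    call void @llvm.amdgcn.wqm.demote(i1 %keep)
    ; Unlike llvm.amdgcn.ps.live, this query observes the demote above.
    %live = call i1 @llvm.amdgcn.live.mask()
    %r = select i1 %live, float 1.000000e+00, float 0.000000e+00
    ret float %r
  }

  declare void @llvm.amdgcn.wqm.demote(i1)
  declare i1 @llvm.amdgcn.live.mask()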
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4318,6 +4318,11 @@
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
       break;
     }
+    case Intrinsic::amdgcn_live_mask: {
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      break;
+    }
+    case Intrinsic::amdgcn_wqm_demote:
     case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@
 def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
 def : SourceOfDivergence<int_amdgcn_kill>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_live_mask>;
 def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -43,12 +43,9 @@
   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;
-  bool dominatesAllReachable(MachineBasicBlock &MBB);
   void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
-  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                  DebugLoc DL);
 
-  bool kill(MachineInstr &MI);
+  bool tidySCCDef(MachineInstr &MI);
   void earlyTerm(MachineInstr &MI);
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
@@ -56,6 +53,9 @@
 public:
   static char ID;
 
+  unsigned MovOpc;
+  Register ExecReg;
+
   SIInsertSkips() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -137,15 +137,6 @@
   return false;
 }
 
-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
-  for (MachineBasicBlock *Other : depth_first(&MBB)) {
-    if (!MDT->dominates(&MBB, Other))
-      return false;
-  }
-  return true;
-}
-
 static void generateEndPgm(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            const SIInstrInfo *TII, bool IsPS) {
@@ -180,11 +171,8 @@
   }
 
   if (ClearExec && !EarlyExitClearsExec) {
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
+    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
     EarlyExitClearsExec = true;
   }
 }
@@ -204,173 +192,30 @@
   MDT->getBase().applyUpdates(DTUpdates);
 }
 
-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I, DebugLoc DL) {
-  MachineFunction *MF = MBB.getParent();
-  (void)MF;
-  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
-  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
-  // basic block that has no further successors (e.g., there was an
-  // `unreachable` there in IR). This can happen with original source of the
-  // form:
-  //
-  //   if (uniform_condition) {
-  //     write_to_memory();
-  //     discard;
-  //   }
-  //
-  // In this case, we write the "null_export; s_endpgm" skip code in the
-  // already-existing basic block.
-  auto NextBBI = std::next(MBB.getIterator());
-  bool NoSuccessor =
-      I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
-  if (NoSuccessor) {
-    generateEndPgm(MBB, I, DL, TII, true);
-  } else {
-    ensureEarlyExitBlock(MBB, false);
-
-    MachineInstr *BranchMI =
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-            .addMBB(EarlyExitBlock);
-
-    // Split the block if the branch will not come at the end.
-    auto Next = std::next(BranchMI->getIterator());
-    if (Next != MBB.end() && !Next->isTerminator())
-      splitBlock(MBB, *BranchMI, MDT);
-
-    MBB.addSuccessor(EarlyExitBlock);
-    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
-  }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
+bool SIInsertSkips::tidySCCDef(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
-    unsigned Opcode = 0;
-
-    // The opcodes are inverted because the inline immediate has to be
-    // the first operand, e.g. from "x < imm" to "imm > x"
-    switch (MI.getOperand(2).getImm()) {
-    case ISD::SETOEQ:
-    case ISD::SETEQ:
-      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
-      break;
-    case ISD::SETOGT:
-    case ISD::SETGT:
-      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
-      break;
-    case ISD::SETOGE:
-    case ISD::SETGE:
-      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
-      break;
-    case ISD::SETOLT:
-    case ISD::SETLT:
-      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
-      break;
-    case ISD::SETOLE:
-    case ISD::SETLE:
-      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
-      break;
-    case ISD::SETONE:
-    case ISD::SETNE:
-      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
-      break;
-    case ISD::SETO:
-      Opcode = AMDGPU::V_CMPX_O_F32_e64;
-      break;
-    case ISD::SETUO:
-      Opcode = AMDGPU::V_CMPX_U_F32_e64;
-      break;
-    case ISD::SETUEQ:
-      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
-      break;
-    case ISD::SETUGT:
-      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
-      break;
-    case ISD::SETUGE:
-      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
-      break;
-    case ISD::SETULT:
-      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
-      break;
-    case ISD::SETULE:
-      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
-      break;
-    case ISD::SETUNE:
-      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
-      break;
-    default:
-      llvm_unreachable("invalid ISD:SET cond code");
-    }
-
-    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
-    if (ST.hasNoSdstCMPX())
-      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
-    assert(MI.getOperand(0).isReg());
-
-    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
-                    MI.getOperand(0).getReg())) {
-      Opcode = AMDGPU::getVOPe32(Opcode);
-      BuildMI(MBB, &MI, DL, TII->get(Opcode))
-          .add(MI.getOperand(1))
-          .add(MI.getOperand(0));
-    } else {
-      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
-      if (!ST.hasNoSdstCMPX())
-        I.addReg(AMDGPU::VCC, RegState::Define);
-
-      I.addImm(0)  // src0 modifiers
-          .add(MI.getOperand(1))
-          .addImm(0)  // src1 modifiers
-          .add(MI.getOperand(0));
-
-      I.addImm(0);  // omod
-    }
-    return true;
-  }
-  case AMDGPU::SI_KILL_I1_TERMINATOR: {
-    const MachineFunction *MF = MI.getParent()->getParent();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    const MachineOperand &Op = MI.getOperand(0);
-    int64_t KillVal = MI.getOperand(1).getImm();
-    assert(KillVal == 0 || KillVal == -1);
-
-    // Kill all threads if Op0 is an immediate and equal to the Kill value.
-    if (Op.isImm()) {
-      int64_t Imm = Op.getImm();
-      assert(Imm == 0 || Imm == -1);
-
-      if (Imm == KillVal) {
-        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
-                                                     : AMDGPU::S_MOV_B64), Exec)
-            .addImm(0);
-        return true;
+  // Peek at the previous instruction in case this can be made unconditional.
+  assert(MI.getIterator() != MBB.begin());
+  auto Prev = std::prev(MI.getIterator());
+  if (Prev->getOpcode() == AMDGPU::S_ANDN2_B32 ||
+      Prev->getOpcode() == AMDGPU::S_ANDN2_B64) {
+    auto Src0 = Prev->getOperand(1);
+    auto Src1 = Prev->getOperand(2);
+    if (Src0.isReg() && Src0.getReg() == ExecReg && Src1.isReg() &&
+        Src1.getReg() == ExecReg) {
+      // SCC will always be 0; use an unconditional branch.
+      Register Dst = Prev->getOperand(0).getReg();
+      // Simplify the S_ANDN2; remove it entirely when it defines exec, as
+      // exec is set in the exit block anyway.
+      if (Dst != ExecReg) {
+        BuildMI(MBB, Prev, Prev->getDebugLoc(), TII->get(MovOpc), Dst)
            .addImm(0);
      }
-      return false;
+      Prev->eraseFromParent();
+      return true;
    }
-
-    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
-    if (ST.isWave32())
-      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
-    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
-        .addReg(Exec)
-        .add(Op);
-    return true;
-  }
-  default:
-    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
+  return false;
 }
 
 void SIInsertSkips::earlyTerm(MachineInstr &MI) {
@@ -379,15 +224,37 @@
 
   ensureEarlyExitBlock(MBB, true);
 
-  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
-                      .addMBB(EarlyExitBlock);
-  auto Next = std::next(MI.getIterator());
+  // Can we make the branch unconditional?
+  bool ReplaceSuccessor = MBB.succ_size() <= 1;
+  if (ReplaceSuccessor)
+    ReplaceSuccessor = tidySCCDef(MI);
+  MachineInstr *BranchMI = nullptr;
+  if (ReplaceSuccessor) {
+    // The branch is always taken.
+    BranchMI =
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(EarlyExitBlock);
+  } else {
+    BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+                   .addMBB(EarlyExitBlock);
+  }
+
+  auto Next = std::next(MI.getIterator());
   if (Next != MBB.end() && !Next->isTerminator())
     splitBlock(MBB, *BranchMI, MDT);
 
-  MBB.addSuccessor(EarlyExitBlock);
+  MachineBasicBlock *OldSuccessor = nullptr;
+  if (ReplaceSuccessor && !MBB.succ_empty()) {
+    OldSuccessor = *MBB.succ_begin();
+    MBB.replaceSuccessor(OldSuccessor, EarlyExitBlock);
+  } else {
+    MBB.addSuccessor(EarlyExitBlock);
+  }
+
+  // Update MDT
   MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+  if (OldSuccessor)
+    MDT->getBase().deleteEdge(&MBB, OldSuccessor);
 
   MI.eraseFromParent();
 }
@@ -416,7 +283,9 @@
   MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;
 
-  SmallVector<MachineInstr *, 4> KillInstrs;
+  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
   bool MadeChange = false;
@@ -441,41 +310,6 @@
         }
         break;
 
-      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-      case AMDGPU::SI_KILL_I1_TERMINATOR: {
-        MadeChange = true;
-        bool CanKill = kill(MI);
-
-        // Check if we can add an early "if exec=0 { end shader }".
-        //
-        // Note that we _always_ do this if it is correct, even if the kill
-        // happens fairly late in the shader, because the null export should
-        // generally still be cheaper than normal export(s).
-        //
-        // TODO: The dominatesAllReachable check is conservative: if the
-        // dominance is only missing due to _uniform_ branches, we could
-        // in fact insert the early-exit as well.
-        if (CanKill &&
-            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-            dominatesAllReachable(MBB)) {
-          // Mark the instruction for kill-if-dead insertion. We delay this
-          // change because it modifies the CFG.
-          KillInstrs.push_back(&MI);
-        } else {
-          MI.eraseFromParent();
-        }
-        break;
-      }
-
-      case AMDGPU::SI_KILL_CLEANUP:
-        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-            dominatesAllReachable(MBB)) {
-          KillInstrs.push_back(&MI);
-        } else {
-          MI.eraseFromParent();
-        }
-        break;
-
       case AMDGPU::SI_EARLY_TERMINATE_SCC0:
         EarlyTermInstrs.push_back(&MI);
         break;
@@ -488,17 +322,13 @@
 
   for (MachineInstr *Instr : EarlyTermInstrs) {
     // Early termination in GS does nothing
-    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) {
       earlyTerm(*Instr);
-    else
+    } else {
+      tidySCCDef(*Instr);
       Instr->eraseFromParent();
+    }
   }
-  for (MachineInstr *Kill : KillInstrs) {
-    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
-               Kill->getDebugLoc());
-    Kill->eraseFromParent();
-  }
-  KillInstrs.clear();
   EarlyTermInstrs.clear();
   EarlyExitBlock = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1635,6 +1635,18 @@
     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
     break;
 
+  case AMDGPU::S_AND_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B64));
+    break;
+
+  case AMDGPU::S_AND_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2266,10 +2278,12 @@
   case AMDGPU::S_XOR_B64_term:
   case AMDGPU::S_OR_B64_term:
   case AMDGPU::S_ANDN2_B64_term:
+  case AMDGPU::S_AND_B64_term:
   case AMDGPU::S_MOV_B32_term:
   case AMDGPU::S_XOR_B32_term:
   case AMDGPU::S_OR_B32_term:
   case AMDGPU::S_ANDN2_B32_term:
+  case AMDGPU::S_AND_B32_term:
     break;
   case AMDGPU::SI_IF:
   case AMDGPU::SI_ELSE:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -230,6 +230,7 @@
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 }
 
 let WaveSizePredicate = isWave32 in {
@@ -237,6 +238,7 @@
 def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 }
 
@@ -352,9 +354,6 @@
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
@@ -376,6 +375,19 @@
   let SALU = 1;
 }
 
+let Uses = [EXEC] in {
+def SI_LIVE_MASK : PseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_live_mask))]> {
+  let SALU = 1;
+}
+let Defs = [EXEC,SCC] in {
+// Demote: turn a pixel shader thread into a helper lane.
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
+} // End Defs = [EXEC,SCC]
+} // End Uses = [EXEC]
+
 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   [(int_amdgcn_unreachable)],
   "; divergent unreachable"> {
@@ -768,6 +780,16 @@
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
 
+def : Pat <
+  (int_amdgcn_wqm_demote i1:$src),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
+>;
+
 // TODO: we could add more variants for other types of conditionals
 
 def : Pat <
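The second pattern above folds an inverted condition directly into the kill
value: a demote of "not %src" selects SI_DEMOTE_I1 with killvalue -1 instead of
first materializing the negation. A small IR sketch (hypothetical, not from the
patch's tests) of input that exercises it:

  ; Lanes where %cond is true are demoted; instruction selection folds the
  ; xor away and emits SI_DEMOTE_I1 $cond, -1.
  define amdgpu_ps void @demote_inverted(i32 %v) {
  .entry:
    %cond = icmp eq i32 %v, 0
    %inv = xor i1 %cond, true
    call void @llvm.amdgcn.wqm.demote(i1 %inv)
    ret void
  }

  declare void @llvm.amdgcn.wqm.demote(i1)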
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -71,10 +71,8 @@
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr *> LoweredEndCf;
   DenseSet<Register> LoweredIf;
-  SmallSet<MachineInstr *, 32> NeedsKillCleanup;
 
   const TargetRegisterClass *BoolRC = nullptr;
-  bool InsertKillCleanups;
   unsigned AndOpc;
   unsigned OrOpc;
   unsigned XorOpc;
@@ -208,28 +206,7 @@
   // just cleared bits.
   bool SimpleIf = isSimpleIf(MI, MRI);
 
-  if (InsertKillCleanups) {
-    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
-    // flag the associated SI_END_CF for insertion of a kill cleanup.
-    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
-    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
-      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
-      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
-      MachineOperand &NextExec = UseMI->getOperand(0);
-      Register NextExecReg = NextExec.getReg();
-      if (NextExec.isDead()) {
-        assert(!SimpleIf);
-        break;
-      }
-      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
-    }
-    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
-      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
-        NeedsKillCleanup.insert(&*UseMI);
-        SimpleIf = false;
-      }
-    }
-  } else if (SimpleIf) {
+  if (SimpleIf) {
     // Check for SI_KILL_*_TERMINATOR on path from if to endif.
     // if there is any such terminator simplifications are not safe.
     auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
@@ -448,8 +425,6 @@
   auto E = B->end();
   for ( ; It != E; ++It) {
-    if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
-      continue;
     if (TII->mayReadEXEC(*MRI, *It))
       break;
   }
@@ -502,18 +477,8 @@
 
   LoweredEndCf.insert(NewMI);
 
-  // If this ends control flow which contains kills (as flagged in emitIf)
-  // then insert an SI_KILL_CLEANUP immediately following the exec mask
-  // manipulation. This can be lowered to early termination if appropriate.
-  MachineInstr *CleanUpMI = nullptr;
-  if (NeedsKillCleanup.count(&MI))
-    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
-
-  if (LIS) {
+  if (LIS)
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
-    if (CleanUpMI)
-      LIS->InsertMachineInstrInMaps(*CleanUpMI);
-  }
 
   MI.eraseFromParent();
 
@@ -724,8 +689,6 @@
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
-  InsertKillCleanups =
-      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   if (ST.isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
@@ -774,10 +737,7 @@
       case AMDGPU::SI_LOOP:
       case AMDGPU::SI_END_CF:
-        // Only build worklist if SI_IF instructions must be processed first.
-        if (InsertKillCleanups)
-          Worklist.push_back(&MI);
-        else
-          SplitMBB = process(MI);
+        SplitMBB = process(MI);
         break;
 
       default:
@@ -798,7 +758,6 @@
 
   LoweredEndCf.clear();
   LoweredIf.clear();
-  NeedsKillCleanup.clear();
 
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -219,6 +219,18 @@
     MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
     return true;
   }
+  case AMDGPU::S_AND_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+    return true;
+  }
+  case AMDGPU::S_AND_B32_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+    return true;
+  }
   default:
     return false;
   }
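The SIWholeQuadMode changes below hinge on the difference between kill and
demote: both remove lanes from the tracked live mask, but a demote must keep
whole quads executing as helpers while WQM code can still require their
derivatives. A hedged IR sketch (hypothetical, mirroring the shape of the tests
added at the end of this patch) of the case that needs this:

  ; The demoted lanes must survive as helpers until the sample below, since
  ; image sampling computes derivatives across each 2x2 quad.
  define amdgpu_ps <4 x float> @demote_before_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %coord, i1 %cond) {
  .entry:
    call void @llvm.amdgcn.wqm.demote(i1 %cond)
    %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
    ret <4 x float> %tex
  }

  declare void @llvm.amdgcn.wqm.demote(i1)
  declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)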
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -11,10 +11,8 @@
 /// shaders, and whole wavefront mode for all programs.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). It ensures that WQM is
+/// enabled when necessary, but disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
@@ -61,8 +59,10 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/raw_ostream.h"
@@ -115,6 +115,8 @@
   char Needs = 0;
   char InNeeds = 0;
   char OutNeeds = 0;
+  char InitialState = 0;
+  bool NeedsLowering = false;
 };
 
 struct WorkItem {
@@ -128,23 +130,33 @@
 
 class SIWholeQuadMode : public MachineFunctionPass {
 private:
-  CallingConv::ID CallingConv;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
   MachineRegisterInfo *MRI;
   LiveIntervals *LIS;
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
 
   unsigned AndOpc;
-  unsigned XorTermrOpc;
+  unsigned AndN2Opc;
+  unsigned XorOpc;
+  unsigned AndSaveExecOpc;
   unsigned OrSaveExecOpc;
-  unsigned Exec;
+  unsigned WQMOpc;
+  Register Exec;
+  Register LiveMaskReg;
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
-  MapVector<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 4> LiveMaskQueries;
+  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+
+  // Tracks state (WQM/WWM/Exact) after a given instruction
+  DenseMap<const MachineInstr *, char> StateTransition;
+
+  SmallVector<MachineInstr *, 4> LiveMaskQueries;
   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+  SmallVector<MachineInstr *, 4> KillInstrs;
 
   void printInfo();
@@ -166,17 +178,27 @@
                                       MachineBasicBlock::iterator Last,
                                       bool PreferLast, bool SaveSCC);
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-               unsigned SaveWQM, unsigned LiveMaskReg);
+               Register SaveWQM);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-             unsigned SavedWQM);
+             Register SavedWQM);
   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-             unsigned SaveOrig);
+             Register SaveOrig);
   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-               unsigned SavedOrig);
-  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+               Register SavedOrig, char NonWWMState);
+
+  bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI);
+  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
+
+  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
+                            bool isDemote);
+  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
 
-  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+  void lowerBlock(MachineBasicBlock &MBB);
+  void processBlock(MachineBasicBlock &MBB, bool isEntry);
+
+  void lowerLiveMaskQueries();
   void lowerCopyInstrs();
+  void lowerKillInstrs();
 
 public:
   static char ID;
@@ -189,10 +211,14 @@
   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
-    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addPreserved<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -204,6 +230,8 @@
 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                     false)
@@ -261,8 +289,6 @@
 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                                Register Reg, unsigned SubReg, char Flag,
                                std::vector<WorkItem> &Worklist) {
-  assert(!MRI->isSSA());
-
   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
 
   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
@@ -341,28 +367,14 @@
       if (!Value)
         continue;
 
-      if (MRI->isSSA()) {
-        // Since we're in machine SSA, we do not need to track physical
-        // registers across basic blocks.
-        if (Value->isPHIDef())
-          continue;
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
-                        Worklist);
-      } else {
-        markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
-      }
+      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
 
       continue;
     }
 
-    if (MRI->isSSA()) {
-      for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-        markInstruction(DefMI, Flag, Worklist);
-    } else {
-      LiveRange &LR = LIS->getInterval(Reg);
-      markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
-    }
+    LiveRange &LR = LIS->getInterval(Reg);
+    markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
   }
 }
@@ -441,12 +453,18 @@
         III.Disabled = StateWQM | StateWWM;
         continue;
       } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
           LiveMaskQueries.push_back(&MI);
+        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                   Opcode == AMDGPU::SI_DEMOTE_I1) {
+          KillInstrs.push_back(&MI);
+          BBI.NeedsLowering = true;
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
           // VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
+          // FIXME: is this still valid?
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
               continue;
@@ -603,6 +621,340 @@
   return Restore;
 }
 
+bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI) {
+  // Cannot split immediately before the epilog
+  // because there are values in physical registers.
+  if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+    return false;
+  }
+
+  return true;
+}
+
+MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
+                                               MachineInstr *TermMI) {
+  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
+                    << *TermMI << "\n");
+
+  MachineBasicBlock *SplitBB =
+      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
+
+  // Convert the last instruction into a terminator.
+  // Note: this only covers the expected patterns.
+  switch (TermMI->getOpcode()) {
+  case AMDGPU::S_AND_B32:
+    TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term));
+    break;
+  case AMDGPU::S_AND_B64:
+    TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term));
+    break;
+  case AMDGPU::S_MOV_B32:
+    TermMI->setDesc(TII->get(AMDGPU::S_MOV_B32_term));
+    break;
+  case AMDGPU::S_MOV_B64:
+    TermMI->setDesc(TII->get(AMDGPU::S_MOV_B64_term));
+    break;
+  default:
+    break;
+  }
+
+  if (SplitBB != BB) {
+    // Update dominator trees
+    using DomTreeT = DomTreeBase<MachineBasicBlock>;
+    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+    for (MachineBasicBlock *Succ : SplitBB->successors()) {
+      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
+    }
+    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
+    if (MDT)
+      MDT->getBase().applyUpdates(DTUpdates);
+    if (PDT)
+      PDT->getBase().applyUpdates(DTUpdates);
+
+    // Link blocks
+    MachineInstr *MI =
+        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
+            .addMBB(SplitBB);
+    LIS->InsertMachineInstrInMaps(*MI);
+  }
+
+  return SplitBB;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
+                                            MachineInstr &MI) {
+  const DebugLoc &DL = MI.getDebugLoc();
+  unsigned Opcode = 0;
+
+  assert(MI.getOperand(0).isReg());
+
+  // The operands are reversed for the comparison because the inline immediate
+  // must be the first argument. Note also that the comparison is for live
+  // lanes, while here we compute the killed lanes, so each condition is
+  // inverted as well.
+  switch (MI.getOperand(2).getImm()) {
+  case ISD::SETOEQ:
+  case ISD::SETEQ:
+    Opcode = AMDGPU::V_CMP_LG_F32_e64;
+    break;
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    Opcode = AMDGPU::V_CMP_GT_F32_e64;
+    break;
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Opcode = AMDGPU::V_CMP_GE_F32_e64;
+    break;
+  case ISD::SETOLT:
+  case ISD::SETLT:
+    Opcode = AMDGPU::V_CMP_LT_F32_e64;
+    break;
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Opcode = AMDGPU::V_CMP_LE_F32_e64;
+    break;
+  case ISD::SETONE:
+  case ISD::SETNE:
+    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
+    break;
+  case ISD::SETO:
+    Opcode = AMDGPU::V_CMP_O_F32_e64;
+    break;
+  case ISD::SETUO:
+    Opcode = AMDGPU::V_CMP_U_F32_e64;
+    break;
+  case ISD::SETUEQ:
+    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
+    break;
+  case ISD::SETUGT:
+    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
+    break;
+  case ISD::SETUGE:
+    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
+    break;
+  case ISD::SETULT:
+    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
+    break;
+  case ISD::SETULE:
+    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
+    break;
+  case ISD::SETUNE:
+    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
+    break;
+  default:
+    llvm_unreachable("invalid ISD:SET cond code");
+  }
+
+  // Pick the e32 or e64 encoding based on the register class of the operands.
+  MachineInstr *VcmpMI;
+  const MachineOperand &Op0 = MI.getOperand(0);
+  const MachineOperand &Op1 = MI.getOperand(1);
+  if (TRI->isVGPR(*MRI, Op0.getReg())) {
+    Opcode = AMDGPU::getVOPe32(Opcode);
+    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
+  } else {
+    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
+                 .addReg(AMDGPU::VCC, RegState::Define)
+                 .addImm(0)  // src0 modifiers
+                 .add(Op1)
+                 .addImm(0)  // src1 modifiers
+                 .add(Op0)
+                 .addImm(0); // omod
+  }
+
+  // VCC represents the lanes killed.
+  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
+  MachineInstr *MaskUpdateMI =
+      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+          .addReg(LiveMaskReg)
+          .addReg(VCC);
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+  MachineInstr *EarlyTermMI =
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+  MachineInstr *ExecMaskMI =
+      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
+
+  assert(MBB.succ_size() == 1);
+  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+                              .addMBB(*MBB.succ_begin());
+
+  // Update live intervals
+  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
+  MBB.remove(&MI);
+
+  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
+  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+  LIS->InsertMachineInstrInMaps(*NewTerm);
+
+  return NewTerm;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
+                                           MachineInstr &MI, bool isDemote) {
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstr *MaskUpdateMI = nullptr;
+
+  const MachineOperand &Op = MI.getOperand(0);
+  int64_t KillVal = MI.getOperand(1).getImm();
+  MachineInstr *ComputeKilledMaskMI = nullptr;
+  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
+  Register TmpReg;
+
+  // Is this a static or dynamic kill?
+  if (Op.isImm()) {
+    if (Op.getImm() == KillVal) {
+      // Static: all active lanes are killed
+      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+                         .addReg(LiveMaskReg)
+                         .addReg(Exec);
+    } else {
+      // Static: kill does nothing
+      MachineInstr *NewTerm = nullptr;
+      if (!isDemote) {
+        assert(MBB.succ_size() == 1);
+        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+                      .addMBB(*MBB.succ_begin());
+        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+      } else {
+        LIS->RemoveMachineInstrFromMaps(MI);
+      }
+      MBB.remove(&MI);
+      return NewTerm;
+    }
+  } else {
+    if (!KillVal) {
+      // Op represents live lanes after kill,
+      // so exec mask needs to be factored in.
+      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
+      ComputeKilledMaskMI =
+          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
+      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+                         .addReg(LiveMaskReg)
+                         .addReg(TmpReg);
+    } else {
+      // Op represents lanes to kill
+      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+                         .addReg(LiveMaskReg)
+                         .add(Op);
+    }
+  }
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+  MachineInstr *EarlyTermMI =
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+  // If we got this far, some lanes are still live;
+  // update EXEC to deactivate lanes as appropriate.
+  MachineInstr *NewTerm;
+  MachineInstr *WQMMaskMI = nullptr;
+  Register LiveMaskWQM;
+  if (isDemote) {
+    // Demote: deactivate quads containing only helper lanes
+    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
+    WQMMaskMI =
+        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
+    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
+                  .addReg(Exec)
+                  .addReg(LiveMaskWQM);
+  } else {
+    // Kill: deactivate lanes
+    if (Op.isImm()) {
+      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+    } else {
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+                    .addReg(Exec)
+                    .addReg(LiveMaskReg);
+    }
+  }
+
+  // Update live intervals
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MBB.remove(&MI);
+  assert(EarlyTermMI);
+  assert(MaskUpdateMI);
+  assert(NewTerm);
+  if (ComputeKilledMaskMI)
+    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
+  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+  if (WQMMaskMI)
+    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
+  LIS->InsertMachineInstrInMaps(*NewTerm);
+
+  if (CndReg) {
+    LIS->removeInterval(CndReg);
+    LIS->createAndComputeVirtRegInterval(CndReg);
+  }
+  if (TmpReg)
+    LIS->createAndComputeVirtRegInterval(TmpReg);
+  if (LiveMaskWQM)
+    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
+
+  return NewTerm;
+}
+
+// Replace (or supplement) instructions accessing the live mask.
+// This can only happen once all the live mask registers have been created
+// and the execute state (WQM/WWM/Exact) of instructions is known.
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
+  auto BII = Blocks.find(&MBB);
+  if (BII == Blocks.end())
+    return;
+
+  const BlockInfo &BI = BII->second;
+  if (!BI.NeedsLowering)
+    return;
+
+  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
+
+  SmallVector<MachineInstr *, 4> SplitPoints;
+  char State = BI.InitialState;
+
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  while (II != IE) {
+    auto Next = std::next(II);
+    MachineInstr &MI = *II;
+
+    if (StateTransition.count(&MI))
+      State = StateTransition[&MI];
+
+    MachineInstr *SplitPoint = nullptr;
+    switch (MI.getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
+      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
+      break;
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
+      SplitPoint = lowerKillI1(MBB, MI, false);
+      break;
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+      SplitPoint = lowerKillF32(MBB, MI);
+      break;
+    default:
+      break;
+    }
+    if (SplitPoint)
+      SplitPoints.push_back(SplitPoint);
+
+    II = Next;
+  }
+
+  // Perform splitting after the instruction scan to simplify iteration.
+  if (!SplitPoints.empty()) {
+    MachineBasicBlock *BB = &MBB;
+    for (MachineInstr *MI : SplitPoints) {
+      BB = splitBlock(BB, MI);
+    }
+  }
+}
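lowerKillI1 and lowerBlock above update the tracked live-mask register at each
demote or kill, and lowerLiveMaskQueries (further below) replaces each
SI_LIVE_MASK with a copy of that register, so queries stay ordered relative to
demotes. A hedged IR sketch (hypothetical) where hoisting the query out of the
loop would return a stale answer:

  ; The query must stay inside the loop: after the demote in iteration N the
  ; result can flip from true to false for a given lane.
  define amdgpu_ps float @live_mask_in_loop(i32 %n, float %x) {
  .entry:
    br label %loop

  loop:
    %i = phi i32 [ 0, %.entry ], [ %i.next, %loop ]
    %c = fcmp olt float %x, 0.000000e+00
    call void @llvm.amdgcn.wqm.demote(i1 %c)
    %live = call i1 @llvm.amdgcn.live.mask()
    %i.next = add i32 %i, 1
    %done = icmp eq i32 %i.next, %n
    br i1 %done, label %exit, label %loop

  exit:
    %r = select i1 %live, float 1.000000e+00, float 0.000000e+00
    ret float %r
  }

  declare void @llvm.amdgcn.wqm.demote(i1)
  declare i1 @llvm.amdgcn.live.mask()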
 // Return an iterator in the (inclusive) range [First, Last] at which
 // instructions can be safely inserted, keeping in mind that some of the
 // instructions we want to add necessarily clobber SCC.
@@ -679,86 +1031,81 @@
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
-                              unsigned SaveWQM, unsigned LiveMaskReg) {
+                              Register SaveWQM) {
   MachineInstr *MI;
 
   if (SaveWQM) {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
-                   AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
-                 SaveWQM)
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
              .addReg(LiveMaskReg);
   } else {
-    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
-                   AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
-                 Exec)
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
            .addReg(LiveMaskReg);
  }
 
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateExact;
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
-                            unsigned SavedWQM) {
+                            Register SavedWQM) {
   MachineInstr *MI;
 
-  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (SavedWQM) {
     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
              .addReg(SavedWQM);
   } else {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
-                   AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
-                 Exec)
-             .addReg(Exec);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
   }
 
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateWQM;
 }
 
 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
-                            unsigned SaveOrig) {
+                            Register SaveOrig) {
   MachineInstr *MI;
 
   assert(SaveOrig);
   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
            .addImm(-1);
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = StateWWM;
 }
 
 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
-                              unsigned SavedOrig) {
+                              Register SavedOrig, char NonWWMState) {
   MachineInstr *MI;
 
   assert(SavedOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
-               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec)
           .addReg(SavedOrig);
   LIS->InsertMachineInstrInMaps(*MI);
+  StateTransition[MI] = NonWWMState;
 }
 
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
-                                   bool isEntry) {
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) {
   auto BII = Blocks.find(&MBB);
   if (BII == Blocks.end())
     return;
 
-  const BlockInfo &BI = BII->second;
+  BlockInfo &BI = BII->second;
 
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+    BI.InitialState = StateWQM;
     return;
+  }
 
   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                     << ":\n");
 
-  unsigned SavedWQMReg = 0;
-  unsigned SavedNonWWMReg = 0;
+  Register SavedWQMReg;
+  Register SavedNonWWMReg;
   bool WQMFromExec = isEntry;
   char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
   char NonWWMState = 0;
@@ -781,6 +1128,9 @@
   // switch to/from WQM as well.
   MachineBasicBlock::iterator FirstWWM = IE;
 
+  // Record the initial state in the block information.
+  BI.InitialState = State;
+
   for (;;) {
     MachineBasicBlock::iterator Next = II;
     char Needs = StateExact | StateWQM; // WWM is disabled by default
@@ -845,7 +1195,7 @@
 
       if (State == StateWWM) {
         assert(SavedNonWWMReg);
-        fromWWM(MBB, Before, SavedNonWWMReg);
+        fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState);
         LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
         SavedNonWWMReg = 0;
         State = NonWWMState;
@@ -864,7 +1214,7 @@
           SavedWQMReg = MRI->createVirtualRegister(BoolRC);
         }
 
-        toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+        toExact(MBB, Before, SavedWQMReg);
         State = StateExact;
       } else if (State == StateExact && (Needs & StateWQM) &&
                  !(Needs & StateExact)) {
@@ -900,7 +1250,7 @@
   assert(!SavedNonWWMReg);
 }
 
-void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+void SIWholeQuadMode::lowerLiveMaskQueries() {
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     Register Dest = MI->getOperand(0).getReg();
@@ -932,7 +1282,7 @@
 
       // And make it implicitly depend on exec (like all VALU movs should do).
       MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
-    } else if (!MRI->isSSA()) {
+    } else {
       // Remove early-clobber and exec dependency from simple SGPR copies.
       // This allows some to be eliminated during/post RA.
       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
@@ -968,13 +1318,36 @@
   }
 }
 
+void SIWholeQuadMode::lowerKillInstrs() {
+  for (MachineInstr *MI : KillInstrs) {
+    MachineBasicBlock *MBB = MI->getParent();
+    MachineInstr *SplitPoint = nullptr;
+    switch (MI->getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
+      SplitPoint = lowerKillI1(*MBB, *MI, true);
+      break;
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
+      SplitPoint = lowerKillI1(*MBB, *MI, false);
+      break;
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+      SplitPoint = lowerKillF32(*MBB, *MI);
+      break;
+    default:
+      continue;
+    }
+    if (SplitPoint)
+      splitBlock(MBB, SplitPoint);
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
   LowerToMovInstrs.clear();
-  CallingConv = MF.getFunction().getCallingConv();
+  KillInstrs.clear();
+  StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
 
@@ -982,64 +1355,72 @@
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
   if (ST->isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
-    XorTermrOpc = AMDGPU::S_XOR_B32_term;
+    AndN2Opc = AMDGPU::S_ANDN2_B32;
+    XorOpc = AMDGPU::S_XOR_B32;
+    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+    WQMOpc = AMDGPU::S_WQM_B32;
     Exec = AMDGPU::EXEC_LO;
   } else {
     AndOpc = AMDGPU::S_AND_B64;
-    XorTermrOpc = AMDGPU::S_XOR_B64_term;
+    AndN2Opc = AMDGPU::S_ANDN2_B64;
+    XorOpc = AMDGPU::S_XOR_B64;
+    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+    WQMOpc = AMDGPU::S_WQM_B64;
     Exec = AMDGPU::EXEC;
   }
 
-  char GlobalFlags = analyzeFunction(MF);
-  unsigned LiveMaskReg = 0;
-  if (!(GlobalFlags & StateWQM)) {
-    lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() &&
-        LowerToMovInstrs.empty())
-      return !LiveMaskQueries.empty();
-  } else {
-    // Store a copy of the original live mask when required
-    MachineBasicBlock &Entry = MF.front();
-    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
-
-    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
-      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
-      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
-                                 TII->get(AMDGPU::COPY), LiveMaskReg)
-                             .addReg(Exec);
-      LIS->InsertMachineInstrInMaps(*MI);
-    }
+  const char GlobalFlags = analyzeFunction(MF);
+  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
 
-    lowerLiveMaskQueries(LiveMaskReg);
+  LiveMaskReg = Exec;
 
-    if (GlobalFlags == StateWQM) {
-      // For a shader that needs only WQM, we can just set it once.
-      auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
-                        TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
-                                                : AMDGPU::S_WQM_B64),
-                        Exec)
-                    .addReg(Exec);
-      LIS->InsertMachineInstrInMaps(*MI);
+  // The shader is simple and does not need WQM/WWM or any complex lowering.
+  if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() &&
+      LowerToMovInstrs.empty() && KillInstrs.empty()) {
+    lowerLiveMaskQueries();
+    return !LiveMaskQueries.empty();
+  }
 
-      lowerCopyInstrs();
-      // EntryMI may become invalid here
-      return true;
-    }
+  MachineBasicBlock &Entry = MF.front();
+  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+  // Store a copy of the original live mask when required
+  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
+    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
+    MachineInstr *MI =
+        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+            .addReg(Exec);
+    LIS->InsertMachineInstrInMaps(*MI);
   }
 
   LLVM_DEBUG(printInfo());
 
+  lowerLiveMaskQueries();
   lowerCopyInstrs();
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  // The shader only needs WQM.
+  if (GlobalFlags == StateWQM) {
+    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
+                  .addReg(Exec);
+    LIS->InsertMachineInstrInMaps(*MI);
+    lowerKillInstrs();
+  } else {
+    for (auto BII : Blocks)
+      processBlock(*BII.first, BII.first == &Entry);
+    // Lowering blocks causes block splitting, so perform as a second pass.
+    for (auto BII : Blocks)
+      lowerBlock(*BII.first);
+  }
 
-  if (LiveMaskReg)
+  // Compute the live range for the live mask.
+  if (LiveMaskReg != Exec)
     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   // Physical registers like SCC aren't tracked by default anyway, so just
@@ -1047,5 +1428,9 @@
   // the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + // If we performed any kills then recompute EXEC + if (!KillInstrs.empty()) + LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1255 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 0 +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-32-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: 
v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, 
float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 1 +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, s1 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s2, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off 
done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s28, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s14, s28, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s14 +; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s28, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s14, s28, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s14 +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; 
%.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, 
exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v1, v1 
quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; SI-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_movk_i32 s1, 0x3c00 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; 
GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s3, s3, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s3 +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, s1, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s1, 0, s1 +; GFX10-32-NEXT: s_and_b32 s3, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s4, s3, s4 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s4 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], 
s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_8 +; SI-NEXT: BB7_5: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 
1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: 
s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s4, s0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 +; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-32-NEXT: s_and_b32 s4, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s5, s4, s5 +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s5 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s5, s5, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s5 +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s5, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s5 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; 
GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: s_mov_b32 s3, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: s_branch BB7_5 +; GFX10-64-NEXT: BB7_4: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_8 +; GFX10-64-NEXT: BB7_5: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX10-64-NEXT: s_cbranch_execz BB7_4 +; GFX10-64-NEXT: ; %bb.6: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.7: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_branch BB7_4 +; GFX10-64-NEXT: BB7_8: ; %.return +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB7_9: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + 
call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.live.mask() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: live_mask +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: live_mask + ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + ; CHECK: S_ENDPGM 0, implicit [[INT]](s1) + %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + S_ENDPGM 0, implicit %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s + +--- +name: wqm_demote_scc +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: wqm_demote_scc + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... + +--- +name: wqm_demote_vcc +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: wqm_demote_vcc + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... + +--- +name: wqm_demote_constant_true +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_true + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 true + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... + +--- +name: wqm_demote_constant_false +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_false + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 false + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -206,7 +206,8 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] @@ -261,7 +262,8 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] @@ -316,8 +318,9 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[10:11], exec +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz BB1_4 @@ -378,8 +381,9 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_mov_b32 s9, s8 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz BB1_4 diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir --- a/llvm/test/CodeGen/AMDGPU/early-term.mir +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -14,10 +14,6 @@ ret void } - define amdgpu_ps void @early_term_scc0_with_kill() { - ret void - } - define amdgpu_gs void @early_term_scc0_gs() { ret void } @@ -149,58 +145,6 @@ S_ENDPGM 0 ... 
---- -name: early_term_scc0_with_kill -tracksRegLiveness: true -liveins: - - { reg: '$sgpr0' } - - { reg: '$vgpr2' } -body: | - ; CHECK-LABEL: name: early_term_scc0_with_kill - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000), %bb.3(0x00000000) - ; CHECK: liveins: $sgpr0, $vgpr2 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: V_CMPX_LE_F32_nosdst_e32 0, killed $vgpr2, implicit-def $exec, implicit $mode, implicit $exec - ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.4(0x40000000), %bb.3(0x40000000) - ; CHECK: liveins: $sgpr0, $vgpr0 - ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc - ; CHECK: bb.4: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $vgpr0, $scc - ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; CHECK: bb.2: - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec - ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.3: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 - bb.0: - liveins: $sgpr0, $vgpr2 - successors: %bb.1 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - SI_KILL_F32_COND_IMM_TERMINATOR killed $vgpr2, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec - - bb.1: - liveins: $sgpr0, $vgpr0 - successors: %bb.2 - S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc - SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - - bb.2: - liveins: $vgpr0, $vgpr1 - EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec - EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - S_ENDPGM 0 -... - --- name: early_term_scc0_gs tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir +++ /dev/null @@ -1,40 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 %s -o - | FileCheck %s -# https://bugs.freedesktop.org/show_bug.cgi?id=99019 ---- | - define amdgpu_ps void @kill_uncond_branch() { - ret void - } -... 
---- - -# CHECK-LABEL: name: kill_uncond_branch - -# CHECK: bb.0: -# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc - -# CHECK: bb.1: -# CHECK: V_CMPX_LE_F32_e32 -# CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - -# CHECK: bb.2: -# CHECK: S_ENDPGM 0 - -# CHECK: bb.3: -# CHECK-NEXT: EXP_DONE -# CHECK: S_ENDPGM 0 - -name: kill_uncond_branch - -body: | - bb.0: - successors: %bb.1 - S_CBRANCH_VCCNZ %bb.1, implicit $vcc - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec - S_BRANCH %bb.2 - - bb.2: - S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -14,14 +14,14 @@ %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00 %c2 = fcmp oge float %tmp3, 0.0 call void @llvm.amdgcn.kill(i1 %c2) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}vcc_implicit_def: -; GCN-NOT: v_cmp_gt_f32_e32 vcc, +; GCN: v_cmp_ge_f32_e32 vcc, 0, v{{[0-9]+}} ; GCN: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; GFX10: v_cmpx_le_f32_e32 0, v{{[0-9]+}} +; GCN: s_andn2_b64 exec, exec, vcc ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) { %tmp0 = fcmp olt float %arg13, 0.000000e+00 @@ -29,12 +29,12 @@ call void @llvm.amdgcn.kill(i1 %c1) %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}true: ; GCN-NEXT: %bb. -; GCN-NEXT: %bb. 
; GCN-NEXT: s_endpgm define amdgpu_gs void @true() { call void @llvm.amdgcn.kill(i1 true) @@ -46,6 +46,7 @@ ; GCN: s_mov_b64 exec, 0 define amdgpu_gs void @false() { call void @llvm.amdgcn.kill(i1 false) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -53,12 +54,15 @@ ; GCN: v_cmp_lt_i32 ; GCN: v_cmp_lt_i32 ; GCN: s_or_b64 s[0:1] -; GCN: s_and_b64 exec, exec, s[0:1] +; GCN: s_xor_b64 s[0:1], s[0:1], exec +; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN: s_and_b64 exec, exec, s[2:3] define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = or i1 %c1, %c2 call void @llvm.amdgcn.kill(i1 %x) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -66,154 +70,151 @@ ; GCN: v_cmp_lt_i32 ; GCN: v_cmp_lt_i32 ; GCN: s_xor_b64 s[0:1] -; GCN: s_andn2_b64 exec, exec, s[0:1] +; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN: s_and_b64 exec, exec, s[2:3] define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = xor i1 %c1, %c2 %y = xor i1 %x, 1 call void @llvm.amdgcn.kill(i1 %y) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}oeq: -; GCN: v_cmpx_eq_f32 -; GCN-NOT: s_and +; GCN: v_cmp_lg_f32 define amdgpu_gs void @oeq(float %a) { %c1 = fcmp oeq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ogt: -; GCN: v_cmpx_lt_f32 -; GCN-NOT: s_and +; GCN: v_cmp_gt_f32 define amdgpu_gs void @ogt(float %a) { %c1 = fcmp ogt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}oge: -; GCN: v_cmpx_le_f32 -; GCN-NOT: s_and +; GCN: v_cmp_ge_f32 define amdgpu_gs void @oge(float %a) { %c1 = fcmp oge float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}olt: -; GCN: v_cmpx_gt_f32 -; GCN-NOT: s_and +; GCN: v_cmp_lt_f32 define amdgpu_gs void @olt(float %a) { %c1 = fcmp olt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ole: -; GCN: v_cmpx_ge_f32 -; GCN-NOT: s_and +; GCN: v_cmp_le_f32 define amdgpu_gs void @ole(float %a) { %c1 = fcmp ole float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}one: -; GCN: v_cmpx_lg_f32 -; GCN-NOT: s_and +; GCN: v_cmp_eq_f32 define amdgpu_gs void @one(float %a) { %c1 = fcmp one float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ord: -; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. ; GCN: v_cmp_o_f32 define amdgpu_gs void @ord(float %a) { %c1 = fcmp ord float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}uno: -; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. 
; GCN: v_cmp_u_f32 define amdgpu_gs void @uno(float %a) { %c1 = fcmp uno float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ueq: -; GCN: v_cmpx_nlg_f32 -; GCN-NOT: s_and +; GCN: v_cmp_neq_f32 define amdgpu_gs void @ueq(float %a) { %c1 = fcmp ueq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ugt: -; GCN: v_cmpx_nge_f32 -; GCN-NOT: s_and +; GCN: v_cmp_nle_f32 define amdgpu_gs void @ugt(float %a) { %c1 = fcmp ugt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}uge: -; SI: v_cmpx_ngt_f32_e32 vcc, -1.0 -; GFX10: v_cmpx_ngt_f32_e32 -1.0 -; GCN-NOT: s_and +; GCN: v_cmp_nlt_f32_e32 vcc, -1.0 define amdgpu_gs void @uge(float %a) { %c1 = fcmp uge float %a, -1.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ult: -; SI: v_cmpx_nle_f32_e32 vcc, -2.0 -; GFX10: v_cmpx_nle_f32_e32 -2.0 -; GCN-NOT: s_and +; GCN: v_cmp_nge_f32_e32 vcc, -2.0 define amdgpu_gs void @ult(float %a) { %c1 = fcmp ult float %a, -2.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ule: -; SI: v_cmpx_nlt_f32_e32 vcc, 2.0 -; GFX10: v_cmpx_nlt_f32_e32 2.0 -; GCN-NOT: s_and +; GCN: v_cmp_ngt_f32_e32 vcc, 2.0 define amdgpu_gs void @ule(float %a) { %c1 = fcmp ule float %a, 2.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}une: -; SI: v_cmpx_neq_f32_e32 vcc, 0 -; GFX10: v_cmpx_neq_f32_e32 0 -; GCN-NOT: s_and +; GCN: v_cmp_nlg_f32_e32 vcc, 0 define amdgpu_gs void @une(float %a) { %c1 = fcmp une float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}neg_olt: -; SI: v_cmpx_ngt_f32_e32 vcc, 1.0 -; GFX10: v_cmpx_ngt_f32_e32 1.0 -; GCN-NOT: s_and +; GCN: v_cmp_nlt_f32_e32 vcc, 1.0 define amdgpu_gs void @neg_olt(float %a) { %c1 = fcmp olt float %a, 1.0 %c2 = xor i1 %c1, 1 call void @llvm.amdgcn.kill(i1 %c2) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -222,7 +223,7 @@ ; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0 ; GFX10: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0 ; GCN: v_cndmask_b32 -; GCN: v_cmpx_le_f32 +; GCN: v_cmp_ge_f32 define amdgpu_ps void @fcmp_x2(float %a) #0 { %ogt = fcmp nsz ogt float %a, 2.500000e-01 %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00 @@ -231,20 +232,24 @@ ret void } +; Note: an almost identical test for this exists in llvm.amdgcn.wqm.vote.ll ; GCN-LABEL: {{^}}wqm: ; GCN: v_cmp_neq_f32_e32 vcc, 0 -; GCN: s_wqm_b64 s[0:1], vcc +; GCN-DAG: s_wqm_b64 s[2:3], vcc +; GCN-DAG: s_mov_b64 s[0:1], exec +; GCN: s_xor_b64 s[2:3], s[2:3], exec +; GCN: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GCN: s_and_b64 exec, exec, s[0:1] -define amdgpu_ps void @wqm(float %a) { +define amdgpu_ps float @wqm(float %a) { %c1 = fcmp une float %a, 0.0 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) call void @llvm.amdgcn.kill(i1 %c2) - ret void + ret float 0.0 } ; This checks that we use the 64-bit encoding when the operand is a SGPR. 
; GCN-LABEL: {{^}}test_sgpr: -; GCN: v_cmpx_ge_f32_e64 +; GCN: v_cmp_ge_f32_e64 define amdgpu_ps void @test_sgpr(float inreg %a) #0 { %c = fcmp ole float %a, 1.000000e+00 call void @llvm.amdgcn.kill(i1 %c) #1 @@ -252,7 +257,7 @@ } ; GCN-LABEL: {{^}}test_non_inline_imm_sgpr: -; GCN-NOT: v_cmpx_ge_f32_e64 +; GCN-NOT: v_cmp_le_f32_e64 define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 { %c = fcmp ole float %a, 1.500000e+00 call void @llvm.amdgcn.kill(i1 %c) #1 @@ -281,6 +286,7 @@ declare void @llvm.amdgcn.kill(i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 declare i1 @llvm.amdgcn.wqm.vote(i1) attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_branch BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_branch BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_branch BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_branch BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void 
@llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v0 +; SI-NEXT: 
v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off 
done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; 
%bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> 
inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, 
s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[4:5], s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf 
bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s2, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s1, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-32-NEXT: s_or_b32 s2, s1, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 
quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1
+; GFX10-64-NEXT: s_or_b64 s[4:5], s[2:3], vcc
+; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX10-64-NEXT: ; %bb.4: ; %.demote1
+; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT: s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT: ; %bb.5: ; %.demote1
+; GFX10-64-NEXT: s_mov_b64 exec, 0
+; GFX10-64-NEXT: ; %bb.6: ; %.continue1
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT: s_endpgm
+; GFX10-64-NEXT: BB6_7:
+; GFX10-64-NEXT: s_mov_b64 exec, 0
+; GFX10-64-NEXT: exp null off, off, off, off done vm
+; GFX10-64-NEXT: s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 1065353216
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI: ; %bb.0: ; %.entry
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_wqm_b64 exec, exec
+; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT: ; %bb.1: ; %.demote0
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_cbranch_scc0 BB7_9
+; SI-NEXT: ; %bb.2: ; %.demote0
+; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[6:7]
+; SI-NEXT: ; %bb.3: ; %.continue0.preheader
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_branch BB7_5
+; SI-NEXT: BB7_4: ; %.continue1
+; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: s_add_i32 s2, s2, 1
+; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
+; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execz BB7_8
+; SI-NEXT: BB7_5: ; %.continue0
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SI-NEXT: 
v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: 
v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_add_i32 s2, s2, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_or_b32 s4, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s4 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_mov_b64 
s[4:5], 0
+; GFX10-64-NEXT: s_branch BB7_5
+; GFX10-64-NEXT: BB7_4: ; %.continue1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: s_add_i32 s2, s2, 1
+; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
+; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: s_cbranch_execz BB7_8
+; GFX10-64-NEXT: BB7_5: ; %.continue0
+; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7]
+; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1
+; GFX10-64-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT: s_or_b64 s[8:9], s[6:7], vcc
+; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
+; GFX10-64-NEXT: s_cbranch_execz BB7_4
+; GFX10-64-NEXT: ; %bb.6: ; %.demote1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT: ; %bb.7: ; %.demote1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX10-64-NEXT: s_branch BB7_4
+; GFX10-64-NEXT: BB7_8: ; %.return
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT: s_endpgm
+; GFX10-64-NEXT: BB7_9:
+; GFX10-64-NEXT: s_mov_b64 exec, 0
+; GFX10-64-NEXT: exp null off, off, off, off done vm
+; GFX10-64-NEXT: s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 %count
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+declare void 
@llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.live.mask() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll @@ -34,22 +34,27 @@ ret float %r } +; Note: an almost identical test for this exists in llvm.amdgcn.kill.ll ;CHECK-LABEL: {{^}}kill: ;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1 ;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]] -;WAVE64: s_and_b64 exec, exec, [[WQM]] +;WAVE64: s_xor_b64 [[KILL:[^,]+]], [[WQM]], exec +;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] +;WAVE64: s_and_b64 exec, exec, [[MASK]] ;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]] -;WAVE32: s_and_b32 exec_lo, exec_lo, [[WQM]] +;WAVE32: s_xor_b32 [[KILL:[^,]+]], [[WQM]], exec +;WAVE32: s_andn2_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] +;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]] ;CHECK: s_endpgm -define amdgpu_ps void @kill(i32 %v0, i32 %v1) #1 { +define amdgpu_ps float @kill(i32 %v0, i32 %v1) #1 { main_body: %c = icmp eq i32 %v0, %v1 %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c) call void @llvm.amdgcn.kill(i1 %w) - ret void + ret float 0.0 } declare void @llvm.amdgcn.kill(i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -2,7 +2,6 @@ ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { call void @llvm.amdgcn.kill(i1 true) @@ -11,11 +10,10 @@ ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: s_cbranch_execz BB1_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_branch [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: BB1_2: +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { @@ -26,15 +24,17 @@ ; FIXME: Ideally only one would be emitted ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: s_cbranch_execz BB2_3 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: s_cbranch_execz BB2_3 -; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]] ; CHECK-NEXT: 
s_endpgm -; CHECK-NEXT: BB2_3: -; CHECK: exp null +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 { call void @llvm.amdgcn.kill(i1 false) @@ -44,12 +44,13 @@ ; CHECK-LABEL: {{^}}test_kill_depth_var: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_execz BB3_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_andn2_b64 exec, exec, vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: BB3_2: -; CHECK: exp null +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var(float %x) #0 { %cmp = fcmp olt float %x, 0.0 @@ -60,15 +61,19 @@ ; FIXME: Ideally only one would be emitted ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_execz BB4_3 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_execz BB4_3 -; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_andn2_b64 exec, exec, vcc +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]] ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: BB4_3: -; CHECK: exp null +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 { %cmp = fcmp olt float %x, 0.0 @@ -80,15 +85,19 @@ ; FIXME: Ideally only one early-exit would be emitted ; CHECK-LABEL: {{^}}test_kill_depth_var_x2: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_execz BB5_3 -; CHECK-NEXT: ; %bb.1 -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1 -; CHECK-NEXT: s_cbranch_execz BB5_3 -; CHECK-NEXT: ; %bb.2 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_andn2_b64 exec, exec, vcc +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]] ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: BB5_3: -; CHECK: exp null +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { %cmp.x = fcmp olt float %x, 0.0 @@ -100,15 +109,19 @@ ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_execz BB6_3 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_andn2_b64 exec, exec, vcc ; CHECK: v_mov_b32_e64 v7, -1 -; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 -; CHECK-NEXT: s_cbranch_execz BB6_3 -; CHECK-NEXT: ; %bb.2: +; CHECK: v_cmp_lt_f32_e32 vcc, 0, v7 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]] ; CHECK-NEXT: s_endpgm -; 
CHECK-NEXT: BB6_3: +; CHECK-NEXT: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { @@ -124,9 +137,12 @@ ; CHECK-LABEL: {{^}}test_kill_control_flow: ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] +; CHECK: s_cbranch_scc0 [[BODY_BB:BB[0-9]+_[0-9]+]] -; CHECK-NEXT: ; %bb.1: +; CHECK: v_mov_b32_e32 v0, 1.0 +; CHECK: s_branch [[RETURN_BB:BB[0-9]+_[0-9]+]] + +; [[BODY_BB]]: ; CHECK: v_mov_b32_e64 v7, -1 ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 @@ -139,12 +155,17 @@ ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 -; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 - -; TODO: We could do an early-exit here (the branch above is uniform!) -; CHECK-NOT: exp null +; CHECK: v_cmp_lt_f32_e32 vcc, 0, v7 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] +; CHECK: s_andn2_b64 exec, exec, vcc ; CHECK: v_mov_b32_e32 v0, 1.0 + +; CHECK: [[EXIT_BB]] +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null +; CHECK-NEXT: s_endpgm define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 { entry: %cmp = icmp eq i32 %arg, 0 @@ -189,10 +210,9 @@ ; CHECK: ;;#ASMEND ; CHECK: v_mov_b32_e64 v8, -1 ; CHECK: ;;#ASMEND -; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 - -; TODO: We could do an early-exit here (the branch above is uniform!) -; CHECK-NOT: exp null +; CHECK: v_cmp_lt_f32_e32 vcc, 0, v7 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK: buffer_store_dword v8 ; CHECK: v_mov_b32_e64 v9, -2 @@ -200,6 +220,11 @@ ; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}: ; CHECK: buffer_store_dword v9 ; CHECK-NEXT: s_endpgm + +; CHECK: [[EXIT_BB]] +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null +; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 { entry: %cmp = icmp eq i32 %arg, 0 @@ -234,9 +259,12 @@ ; CHECK-LABEL: {{^}}test_kill_control_flow_return: +; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec ; CHECK: v_cmp_eq_u32_e64 [[KILL_CC:s\[[0-9]+:[0-9]+\]]], s0, 1 -; CHECK: s_and_b64 exec, exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz [[EXIT_BB:BB[0-9]+_[0-9]+]] +; CHECK: s_xor_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[KILL_CC]], exec +; CHECK: s_andn2_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], [[LIVE]], [[TMP]] +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] +; CHECK: s_and_b64 exec, exec, [[MASK]] ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; CHECK: s_cbranch_scc0 [[COND_BB:BB[0-9]+_[0-9]+]] @@ -257,6 +285,7 @@ ; CHECK: v_mov_b32_e32 v0, v7 ; CHECK: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null ; CHECK-NEXT: s_endpgm @@ -301,7 +330,9 @@ ; CHECK: v_mov_b32_e64 v7, -1 ; CHECK: v_nop_e64 -; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 +; CHECK: v_cmp_lt_f32_e32 vcc, 0, v7 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.3: ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]] @@ -313,6 +344,11 @@ ; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]] ; CHECK: buffer_store_dword ; CHECK: s_endpgm + +; CHECK: [[EXIT_BB]]: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null +; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { entry: %cmp = icmp eq i32 %arg, 0 @@ -345,10 +381,12 @@ ; bug 28550 ; CHECK-LABEL: {{^}}phi_use_def_before_kill: ; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0, -; CHECK: v_cmpx_lt_f32_e32 vcc, 0, -; CHECK-NEXT: 
s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]+]]
+; CHECK: v_cmp_gt_f32_e32 vcc, 0,
+; CHECK-NEXT: s_andn2_b64 exec, exec, vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXITBB:BB[0-9]+_[0-9]+]]
 ; CHECK: ; %[[KILLBB:bb.[0-9]+]]:
+; CHECK-NEXT: s_andn2_b64
 ; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
 ; CHECK: [[PHIBB]]:
@@ -363,7 +401,8 @@
 ; CHECK-NEXT: s_endpgm
 ; CHECK: [[EXITBB]]:
-; CHECK: exp null
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
 bb:
@@ -395,13 +434,21 @@
 ; CHECK: v_cmp_nge_f32
 ; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
-; CHECK: ; %bb6
-; CHECK: s_mov_b64 exec, 0
+; FIXME: ideally this should just be a s_branch
+; CHECK: s_mov_b64 s[2:3], exec
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; %bb6
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK: [[SKIPKILL]]:
 ; CHECK: v_cmp_nge_f32_e32 vcc
-; CHECK: %bb.3: ; %bb5
-; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
+; CHECK: %bb.4: ; %bb5
+
+; CHECK: [[EXIT_BB]]
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
@@ -430,8 +477,9 @@
 ; CHECK: s_and_saveexec_b64
 ; CHECK: s_xor_b64
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0,
-; CHECK: BB{{[0-9]+_[0-9]+}}:
+; CHECK: v_cmp_lt_f32_e32 vcc, 0,
+; CHECK: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: image_sample_c
@@ -445,6 +493,12 @@
 ; CHECK: [[END]]:
 ; CHECK: s_endpgm
+
+; CHECK: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
+
 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
@@ -470,10 +524,13 @@
 }
 
 ; CHECK-LABEL: {{^}}cbranch_kill:
+; CHECK: ; %bb.{{[0-9]+}}: ; %kill
+; CHECK-NEXT: s_andn2
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK: ; %bb.{{[0-9]+}}: ; %export
 ; CHECK-NEXT: s_or_b64
-; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK: [[EXIT]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
@@ -512,7 +569,7 @@
 ; CHECK-LABEL: {{^}}complex_loop:
 ; CHECK: s_mov_b64 exec, 0
-; CHECK-NOT: exp null
+; CHECK: exp null
 define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 .entry:
   %flaga = icmp sgt i32 %cmpa, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -84,46 +84,51 @@
 ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000)
 ; GCN: liveins: $vgpr0
 ; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+ ; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
 ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
- ; GCN: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GCN: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
 ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
 ; GCN: bb.1.flow.preheader:
 ; GCN: successors: %bb.2(0x80000000)
- ; GCN: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
 ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
- ; GCN: renamable $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
 ; GCN: bb.2.flow:
 ; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
- ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3
- ; GCN: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
- ; GCN: renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc
- ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
+ ; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
+ ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
 ; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec
 ; GCN: bb.3.Flow:
 ; GCN: successors: %bb.4(0x80000000)
- ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
- ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
 ; GCN: bb.4.Flow1:
- ; GCN: successors: %bb.5(0x40000000), %bb.6(0x40000000)
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN: renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
- ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GCN: successors: %bb.5(0x40000000)
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
 ; GCN: bb.5.kill0:
+ ; GCN: successors: %bb.8(0x40000000), %bb.7(0x40000000)
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
+ ; GCN: S_CBRANCH_SCC0 %bb.7, implicit $scc
+ ; GCN: bb.8.kill0:
 ; GCN: successors: %bb.6(0x80000000)
- ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN: liveins: $sgpr2_sgpr3, $scc
 ; GCN: $exec = S_MOV_B64 0
 ; GCN: bb.6.end:
- ; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
- ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
- ; GCN: S_BRANCH %bb.8
+ ; GCN: successors: %bb.9(0x80000000)
+ ; GCN: liveins: $sgpr2_sgpr3
+ ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: S_BRANCH %bb.9
 ; GCN: bb.7:
+ ; GCN: $exec = S_MOV_B64 0
 ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
 ; GCN: S_ENDPGM 0
- ; GCN: bb.8:
+ ; GCN: bb.9:
 entry:
   %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
   %cmp0 = fcmp olt float %.i0, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
 # GCN: $sgpr0 = S_MOV_B32 $exec_lo
@@ -11,7 +11,7 @@
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -29,7 +29,7 @@
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0_sgpr1 = S_MOV_B64 $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -45,7 +45,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 $exec_lo, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -61,7 +61,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -80,7 +80,7 @@
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
     $vgpr0 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -99,7 +99,7 @@
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
     $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -118,7 +118,7 @@
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
     S_WAITCNT_DEPCTR 65534
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -137,7 +137,7 @@
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
     S_WAITCNT_DEPCTR 65535
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -156,7 +156,7 @@
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
     S_WAITCNT_DEPCTR 61438
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: hazard_vcmpx_permlane16
 # GCN: V_CMPX_LE_F32_nosdst_e32
@@ -11,7 +11,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -33,7 +33,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -56,7 +56,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -79,7 +79,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -110,7 +110,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
@@ -133,7 +133,7 @@
   bb.0:
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -34,10 +34,10 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}test_vopc_vcmpx:
-; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-define amdgpu_ps void @test_vopc_vcmpx(float %x) {
+; GCN-LABEL: {{^}}test_vopc_vcmp:
+; GFX1032: v_cmp_ge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
+; GFX1064: v_cmp_ge_f32_e32 vcc, 0, v{{[0-9]+}}
+define amdgpu_ps void @test_vopc_vcmp(float %x) {
   %cmp = fcmp oge float %x, 0.0
   call void @llvm.amdgcn.kill(i1 %cmp)
   ret void
@@ -658,15 +658,22 @@
 }
 
 ; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
 ; GFX1032: s_or_b32 [[OR:s[0-9]+]],
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[OR]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
 ; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
-; GFX1064: s_and_b64 exec, exec, [[OR]]
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[OR]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
 define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
   %c1 = icmp slt i32 %a, %b
   %c2 = icmp slt i32 %c, %d
   %x = or i1 %c1, %c2
   call void @llvm.amdgcn.kill(i1 %x)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   ret void
 }
@@ -828,15 +835,22 @@
 ; GCN-LABEL: {{^}}test_wqm_vote:
 ; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
 ; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[WQM]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
 ; GFX1064: v_cmp_neq_f32_e32 vcc, 0
-; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}}
-; GFX1064: s_and_b64 exec, exec, [[WQM]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
+; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[WQM]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
 define amdgpu_ps void @test_wqm_vote(float %a) {
   %c1 = fcmp une float %a, 0.0
   %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
   call void @llvm.amdgcn.kill(i1 %c2)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   ret void
 }
@@ -1125,9 +1139,11 @@
 declare i1 @llvm.amdgcn.ps.live()
 declare i64 @llvm.cttz.i64(i64, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
 
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone optnone noinline }
 attributes #3 = { "target-features"="+wavefrontsize32" }
 attributes #4 = { "target-features"="+wavefrontsize64" }
+attributes #5 = { inaccessiblememonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -576,7 +576,7 @@
 ;CHECK: image_sample
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: v_cmpx_
+;CHECK: v_cmp_
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: image_sample
@@ -611,9 +611,9 @@
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 ; CHECK: image_sample
-; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
-; CHECK: v_cmpx_
+; CHECK-DAG: buffer_store_dword
+; CHECK-DAG: v_cmp_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0