diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1354,6 +1354,12 @@ [], [IntrNoMem, IntrWillReturn]>; +// Similar to int_amdgcn_ps_live, but cannot be moved by LICM. +// Returns true if lane is not a helper. +def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty], + [], [IntrReadMem, IntrInaccessibleMemOnly] +>; + def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], @@ -1583,6 +1589,11 @@ Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects] >; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], + [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly] +>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. 
with all channels diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4306,6 +4306,11 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); break; } + case Intrinsic::amdgcn_live_mask: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_wqm_demote: case Intrinsic::amdgcn_kill: { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,6 +243,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -375,6 +375,18 @@ let SALU = 1; } +let Uses = [EXEC] in { +def SI_LIVE_MASK : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_live_mask))]> { + let SALU = 1; +} +let Defs = [EXEC,SCC] in { +// Demote: Turn a pixel shader thread into a helper lane. 
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>; +} // End Defs = [EXEC,SCC] +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -767,6 +779,16 @@ (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 SCSrc_i1:$src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 SCSrc_i1:$src, -1) +>; + // TODO: we could add more variants for other types of conditionals def : Pat < diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -115,6 +115,7 @@ char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; bool NeedsLowering = false; }; @@ -149,6 +150,9 @@ DenseMap Instructions; MapVector Blocks; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; @@ -180,11 +184,12 @@ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SavedOrig); + Register SavedOrig, char NonWWMState); MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); - MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI); + MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, + bool isDemote); MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); void lowerBlock(MachineBasicBlock &MBB); @@ -447,10 +452,11 @@ III.Disabled = StateWQM | StateWWM; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { 
LiveMaskQueries.push_back(&MI); } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || - Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) { + Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || + Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; } else if (WQMOutputs) { @@ -778,7 +784,7 @@ } MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, - MachineInstr &MI) { + MachineInstr &MI, bool isDemote) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *MaskUpdateMI = nullptr; @@ -798,10 +804,14 @@ } else { // Static: kill does nothing MachineInstr *NewTerm = nullptr; - assert(MBB.succ_size() == 1); - NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) - .addMBB(*MBB.succ_begin()); - LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + if (isDemote) { + LIS->RemoveMachineInstrFromMaps(MI); + } else { + assert(MBB.succ_size() == 1); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + } MBB.remove(&MI); return NewTerm; } @@ -831,14 +841,26 @@ // In the case we got this far some lanes are still live, // update EXEC to deactivate lanes as appropriate. MachineInstr *NewTerm; + MachineInstr *WQMMaskMI = nullptr; Register LiveMaskWQM; - if (Op.isImm()) { - unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); - } else { - NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + if (isDemote) { + // Demotes deactivate quads with only helper lanes + LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + WQMMaskMI = + BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) .addReg(Exec) - .addReg(LiveMaskReg); + .addReg(LiveMaskWQM); + } else { + // Kills deactivate lanes + if (Op.isImm()) { + unsigned MovOpc = ST->isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + } else { + NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + .addReg(Exec) + .addReg(LiveMaskReg); + } } // Update live intervals @@ -851,6 +873,8 @@ LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI); LIS->InsertMachineInstrInMaps(*MaskUpdateMI); LIS->InsertMachineInstrInMaps(*EarlyTermMI); + if (WQMMaskMI) + LIS->InsertMachineInstrInMaps(*WQMMaskMI); LIS->InsertMachineInstrInMaps(*NewTerm); if (CndReg) { @@ -880,20 +904,28 @@ LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); SmallVector SplitPoints; + char State = BI.InitialState; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); while (II != IE) { auto Next = std::next(II); MachineInstr &MI = *II; + if (StateTransition.count(&MI)) + State = StateTransition[&MI]; + MachineInstr *SplitPoint = nullptr; switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: { + SplitPoint = lowerKillI1(MBB, MI, State == StateWQM); + break; case AMDGPU::SI_KILL_I1_TERMINATOR: - SplitPoint = lowerKillI1(MBB, MI); + SplitPoint = lowerKillI1(MBB, MI, false); break; case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(MBB, MI); break; + } default: break; } @@ -1001,6 +1033,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -1016,6 +1049,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -1027,17 +1061,19 @@ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SavedOrig) { + Register SavedOrig, char NonWWMState) { MachineInstr *MI; assert(SavedOrig); MI = BuildMI(MBB, Before, DebugLoc(), 
TII->get(AMDGPU::EXIT_WWM), Exec) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; } void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) { @@ -1050,6 +1086,7 @@ // This is a non-entry block that is WQM throughout, so no need to do // anything. if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; } @@ -1080,6 +1117,9 @@ // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + // Record initial state in block information. + BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -1144,7 +1184,7 @@ if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); SavedNonWWMReg = 0; State = NonWWMState; @@ -1272,8 +1312,11 @@ MachineBasicBlock *MBB = MI->getParent(); MachineInstr *SplitPoint = nullptr; switch (MI->getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: + SplitPoint = lowerKillI1(*MBB, *MI, true); + break; case AMDGPU::SI_KILL_I1_TERMINATOR: - SplitPoint = lowerKillI1(*MBB, *MI); + SplitPoint = lowerKillI1(*MBB, *MI, false); break; case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(*MBB, *MI); @@ -1293,6 +1336,7 @@ LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); KillInstrs.clear(); + StateTransition.clear(); ST = &MF.getSubtarget(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1255 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc 
-global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 0 +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s1 +; 
GFX10-32-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; 
SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: 
BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX9-NEXT: 
s_xor_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 1 +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, s1 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s2, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[2:3] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: 
s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s28, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s14, s28, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s14 +; 
GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; 
GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; 
GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s28, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s14, s28, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s14 +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; 
GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[16:17], s[16:17], exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, 
off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; 
GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 
killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; SI-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; 
GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_movk_i32 s1, 0x3c00 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s3, s3, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s3 +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] 
row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, s1, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s1, 0, s1 +; GFX10-32-NEXT: s_and_b32 s3, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s4, s3, s4 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s4 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 
v0, 1.0, 0, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = 
select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, 
s[6:7] +; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_8 +; SI-NEXT: BB7_5: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; SI-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] +; 
GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s4, s0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 +; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-32-NEXT: s_and_b32 s4, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s5, s4, s5 +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s5 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 0 +; GFX10-32-NEXT: s_xor_b32 s5, s5, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s0, s0, s5 +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s5, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s5 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: s_mov_b32 s3, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; 
GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: s_branch BB7_5 +; GFX10-64-NEXT: BB7_4: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_8 +; GFX10-64-NEXT: BB7_5: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[6:7], s[8:9] +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX10-64-NEXT: s_cbranch_execz BB7_4 +; GFX10-64-NEXT: ; %bb.6: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0 +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[8:9], exec +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[8:9] 
+; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.7: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_branch BB7_4 +; GFX10-64-NEXT: BB7_8: ; %.return +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB7_9: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 
%count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.live.mask() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: live_mask +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: live_mask + ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + ; 
CHECK: S_ENDPGM 0, implicit [[INT]](s1) + %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + S_ENDPGM 0, implicit %0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s + +--- +name: wqm_demote_scc +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: wqm_demote_scc + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... 
+
+---
+name: wqm_demote_vcc  # demote condition is a compare of two VGPR inputs; the checks expect the compare result on the vcc bank, fed to the intrinsic without an extra copy
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: wqm_demote_vcc
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s1) = G_ICMP intpred(eq), %0, %1
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2
+...
+
+---
+name: wqm_demote_constant_true  # constant-true condition; the checks expect the constant on the sgpr bank and a COPY to the vcc bank before the intrinsic
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_true
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 true
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
+...
+
+---
+name: wqm_demote_constant_false  # constant-false condition; same expected shape as the constant-true case (sgpr constant, then COPY to vcc)
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_false
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 false
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 exec, exec, exec +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, exec +; GFX9-NEXT: s_cbranch_scc0 BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo +; 
GFX10-32-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec +; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; 
GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 
0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; 
GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, 
v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], 
vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: 
s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: 
image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, 
s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp 
null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 
s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 
+; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[4:5], s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 
killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s2, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s1, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-32-NEXT: s_or_b32 s2, s1, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-32-NEXT: ; 
%bb.4: ; %.demote1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX10-64-NEXT: s_or_b64 s[4:5], s[2:3], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; 
GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +;
SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_add_i32 s2, s2, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_8 +; SI-NEXT: BB7_5: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[8:9], s[6:7], 
vcc +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_add_i32 s2, s2, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_or_b32 s4, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s4 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] 
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: s_branch BB7_5 +; GFX10-64-NEXT: BB7_4: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_add_i32 s2, s2, 1 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_8 +; GFX10-64-NEXT: BB7_5: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7] +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; GFX10-64-NEXT: s_cbranch_execz BB7_4 +; GFX10-64-NEXT: ; %bb.6: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.7: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_branch BB7_4 +; GFX10-64-NEXT: BB7_8: ; %.return +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; 
GFX10-64-NEXT: BB7_9:
+; GFX10-64-NEXT: s_mov_b64 exec, 0
+; GFX10-64-NEXT: exp null off, off, off, off done vm
+; GFX10-64-NEXT: s_endpgm
+.entry:
+  ; The interpolation results %x0 / %x1 are not used by the rest of this body.
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  ; Lanes whose %arg converts to a non-zero integer are demoted before the loop.
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  ; An i1 false operand turns every lane that reaches this call into a helper lane.
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  ; Loop header. %count is the iteration counter (0 on entry, %next on the back edge).
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+  ; live.mask is queried inside the loop, after earlier demotes may have run.
+  ; Lanes reported live contribute 0, the remaining lanes contribute %count.
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 %count
+  ; DPP control 85 selects quad lane 1 and control 0 selects quad lane 0
+  ; (quad_perm [1,1,1,1] and [0,0,0,0] in the expected assembly), so %v0 is the
+  ; difference between those two lanes of each quad.
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  ; Skip the demote only for lanes that are live and see a zero difference.
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  ; Iterate while the incremented counter is below %limit (signed compare).
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+  ; Packed half operands: 0x00003c00 and 0x3c000000, i.e. the 0x3c00 move and the
+  ; bfrev of 60 that the expected assembly feeds to the compressed export.
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+; Intrinsics referenced by this file; the trailing #N selects an attribute group below.
+declare void @llvm.amdgcn.wqm.demote(i1) #0
+declare i1 @llvm.amdgcn.live.mask() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }