diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1349,13 +1349,18 @@ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>]>; -// Pixel shaders only: whether the current pixel is live (i.e. not a helper -// invocation for derivative computation). +// Deprecated: use llvm.amdgcn.live.mask instead. def int_amdgcn_ps_live : Intrinsic < [llvm_i1_ty], [], [IntrNoMem, IntrWillReturn]>; +// Query currently live lanes. +// Returns true if lane is live (and not a helper lane). +def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty], + [], [IntrReadMem, IntrInaccessibleMemOnly] +>; + def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], @@ -1585,6 +1590,11 @@ Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects] >; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], + [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly] +>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. 
with all channels diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4194,6 +4194,11 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); break; } + case Intrinsic::amdgcn_live_mask: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_wqm_demote: case Intrinsic::amdgcn_kill: { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,6 +243,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -378,6 +378,18 @@ let SALU = 1; } +let Uses = [EXEC] in { +def SI_LIVE_MASK : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_live_mask))]> { + let SALU = 1; +} +let Defs = [EXEC,SCC] in { +// Demote: Turn a pixel shader thread into a helper lane. 
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>; +} // End Defs = [EXEC,SCC] +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -751,6 +763,16 @@ (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 SCSrc_i1:$src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 SCSrc_i1:$src, -1) +>; + // TODO: we could add more variants for other types of conditionals def : Pat < diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -72,6 +72,7 @@ MachineRegisterInfo *MRI = nullptr; SetVector LoweredEndCf; DenseSet LoweredIf; + SmallSet KillBlocks; const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; @@ -84,6 +85,8 @@ unsigned OrSaveExecOpc; unsigned Exec; + bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End); + void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); @@ -161,8 +164,8 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; -static bool hasKill(const MachineBasicBlock *Begin, - const MachineBasicBlock *End, const SIInstrInfo *TII) { +bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin, + const MachineBasicBlock *End) { DenseSet Visited; SmallVector Worklist(Begin->successors()); @@ -171,9 +174,8 @@ if (MBB == End || !Visited.insert(MBB).second) continue; - for (auto &Term : MBB->terminators()) - if (TII->isKillTerminator(Term.getOpcode())) - return true; + if (KillBlocks.contains(MBB)) + return true; Worklist.append(MBB->succ_begin(), MBB->succ_end()); } @@ -213,7 +215,7 @@ // Check for SI_KILL_*_TERMINATOR on path from if to endif. 
// if there is any such terminator simplifications are not safe. auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII); + SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); } // Add an implicit def of exec to discourage scheduling VALU after this which @@ -799,6 +801,28 @@ Exec = AMDGPU::EXEC; } + // Compute set of blocks with kills + const bool CanDemote = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + for (auto &MBB : MF) { + bool IsKillBlock = false; + for (auto &Term : MBB.terminators()) { + if (TII->isKillTerminator(Term.getOpcode())) { + KillBlocks.insert(&MBB); + IsKillBlock = true; + break; + } + } + if (CanDemote && !IsKillBlock) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + KillBlocks.insert(&MBB); + break; + } + } + } + } + MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); BI != MF.end(); BI = NextBB) { @@ -848,6 +872,7 @@ LoweredEndCf.clear(); LoweredIf.clear(); + KillBlocks.clear(); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -457,10 +457,11 @@ III.Disabled = StateWQM | StateWWM; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { LiveMaskQueries.push_back(&MI); } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || - Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) { + Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || + Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; } else if (WQMOutputs) { @@ -799,6 +800,7 @@ const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *MaskUpdateMI = nullptr; + const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); const MachineOperand &Op = MI.getOperand(0); int64_t 
KillVal = MI.getOperand(1).getImm(); MachineInstr *ComputeKilledMaskMI = nullptr; @@ -815,10 +817,14 @@ } else { // Static: kill does nothing MachineInstr *NewTerm = nullptr; - assert(MBB.succ_size() == 1); - NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) - .addMBB(*MBB.succ_begin()); - LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + LIS->RemoveMachineInstrFromMaps(MI); + } else { + assert(MBB.succ_size() == 1); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + } MBB.remove(&MI); return NewTerm; } @@ -848,17 +854,30 @@ // In the case we got this far some lanes are still live, // update EXEC to deactivate lanes as appropriate. MachineInstr *NewTerm; - if (Op.isImm()) { - unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); - } else if (!IsWQM) { - NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + MachineInstr *WQMMaskMI = nullptr; + Register LiveMaskWQM; + if (IsDemote) { + // Demote: deactivate quads that contain only helper lanes + LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + WQMMaskMI = + BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) .addReg(Exec) - .addReg(LiveMaskReg); + .addReg(LiveMaskWQM); } else { - unsigned Opcode = KillVal ? AndN2Opc : AndOpc; - NewTerm = - BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + // Kills deactivate lanes + if (Op.isImm()) { + unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + } else if (!IsWQM) { + NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + .addReg(Exec) + .addReg(LiveMaskReg); + } else { + unsigned Opcode = KillVal ? 
AndN2Opc : AndOpc; + NewTerm = + BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + } } // Update live intervals @@ -871,6 +890,8 @@ LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI); LIS->InsertMachineInstrInMaps(*MaskUpdateMI); LIS->InsertMachineInstrInMaps(*EarlyTermMI); + if (WQMMaskMI) + LIS->InsertMachineInstrInMaps(*WQMMaskMI); LIS->InsertMachineInstrInMaps(*NewTerm); if (CndReg) { @@ -879,6 +900,8 @@ } if (TmpReg) LIS->createAndComputeVirtRegInterval(TmpReg); + if (LiveMaskWQM) + LIS->createAndComputeVirtRegInterval(LiveMaskWQM); return NewTerm; } @@ -910,6 +933,7 @@ MachineInstr *SplitPoint = nullptr; switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: case AMDGPU::SI_KILL_I1_TERMINATOR: SplitPoint = lowerKillI1(MBB, MI, State == StateWQM); break; @@ -1319,6 +1343,7 @@ MachineBasicBlock *MBB = MI->getParent(); MachineInstr *SplitPoint = nullptr; switch (MI->getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: case AMDGPU::SI_KILL_I1_TERMINATOR: SplitPoint = lowerKillI1(*MBB, *MI, IsWQM); break; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1186 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: 
static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 exec, exec, exec +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, exec +; GFX9-NEXT: s_cbranch_scc0 BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec +; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, 
off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 
+; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; 
SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; 
GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry 
+; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; 
GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, 
v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; 
GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void 
@llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; 
GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 
quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 
+; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s2, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s1, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 +; GFX10-32-NEXT: 
s_and_saveexec_b32 s2, s1 +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; 
GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
#3 + ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s2, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_8 +; SI-NEXT: BB7_5: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], 
-1 +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, 
s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; 
GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-64-NEXT: s_branch BB7_5 +; GFX10-64-NEXT: BB7_4: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 +; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB7_8 +; GFX10-64-NEXT: BB7_5: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], 
s[4:5], -1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX10-64-NEXT: s_cbranch_execz BB7_4 +; GFX10-64-NEXT: ; %bb.6: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.7: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_branch BB7_4 +; GFX10-64-NEXT: BB7_8: ; %.return +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB7_9: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + 
%v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.live.mask() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn
-mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: live_mask +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: live_mask + ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + ; CHECK: S_ENDPGM 0, implicit [[INT]](s1) + %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) + S_ENDPGM 0, implicit %0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s + +--- +name: wqm_demote_scc +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: wqm_demote_scc + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... 
+ +--- +name: wqm_demote_vcc +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: wqm_demote_vcc + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s1) = G_ICMP intpred(eq), %0, %1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 +... + +--- +name: wqm_demote_constant_true +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_true + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 true + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... + +--- +name: wqm_demote_constant_false +legalized: true + +body: | + bb.0: + ; CHECK-LABEL: name: wqm_demote_constant_false + ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false + ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + %0:_(s1) = G_CONSTANT i1 false + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 exec, exec, exec +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, exec +; GFX9-NEXT: s_cbranch_scc0 BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo +; 
GFX10-32-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec +; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; 
GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cbranch_scc0 BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 +; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 
0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cbranch_scc0 BB2_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cbranch_scc0 BB2_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: 
v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 +; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_mov_b64 
exec, 0 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB3_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_5 +; SI-NEXT: BB3_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: 
BB3_5: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB3_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_5 +; GFX9-NEXT: BB3_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_5: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; 
GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_5 +; GFX10-32-NEXT: BB3_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_5: +; +; GFX10-64-LABEL: wqm_demote_1: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_5 +; GFX10-64-NEXT: BB3_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_5: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> 
%rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cbranch_scc0 BB4_4 +; SI-NEXT: ; %bb.2: ; %.demote +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_5 +; SI-NEXT: BB4_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_5: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX9-NEXT: s_cbranch_scc0 BB4_4 +; GFX9-NEXT: ; %bb.2: ; %.demote +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, 
s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_5 +; GFX9-NEXT: BB4_4: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_5: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 +; GFX10-32-NEXT: ; %bb.2: ; %.demote +; GFX10-32-NEXT: s_wqm_b32 s28, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_5 +; GFX10-32-NEXT: BB4_4: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_5: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc +; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec +; GFX10-64-NEXT: 
s_cbranch_scc0 BB4_4 +; GFX10-64-NEXT: ; %bb.2: ; %.demote +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_5 +; GFX10-64-NEXT: BB4_4: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_5: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, exec +; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: 
v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_cbranch_scc0 BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 +; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] +; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; 
SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: ; %bb.4: ; %.demote1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB6_7 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_7: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], 
s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB6_7 +; GFX9-NEXT: ; %bb.5: ; %.demote1 +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 
s2, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s1, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-32-NEXT: ; %bb.5: ; %.demote1 +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] 
+; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 +; GFX10-64-NEXT: ; %bb.5: ; %.demote1 +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 
1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 + ret void +} + +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +; SI-LABEL: wqm_deriv_loop: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.2: ; %.demote0 +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: ; %.continue1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_add_i32 s2, s2, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_8 +; SI-NEXT: BB7_5: ; %.continue0 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +;
SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; SI-NEXT: s_cbranch_execz BB7_4 +; SI-NEXT: ; %bb.6: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cbranch_scc0 BB7_9 +; SI-NEXT: ; %bb.7: ; %.demote1 +; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_branch BB7_4 +; SI-NEXT: BB7_8: ; %.return +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_bfrev_b32_e32 v0, 60 +; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_9: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv_loop: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.2: ; %.demote0 +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch BB7_5 +; GFX9-NEXT: BB7_4: ; %.continue1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_8 +; GFX9-NEXT: BB7_5: ; %.continue0 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX9-NEXT: s_cbranch_execz BB7_4 +; GFX9-NEXT: ; %bb.6: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cbranch_scc0 BB7_9 +; GFX9-NEXT: ; %bb.7: ; %.demote1 +; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_branch BB7_4 +; GFX9-NEXT: BB7_8: ; %.return +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB7_9: +; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv_loop: +; GFX10-32: ; %bb.0: ; %.entry +; 
GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-32-NEXT: ; %bb.2: ; %.demote0 +; GFX10-32-NEXT: s_wqm_b32 s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_branch BB7_5 +; GFX10-32-NEXT: BB7_4: ; %.continue1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_add_i32 s2, s2, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB7_8 +; GFX10-32-NEXT: BB7_5: ; %.continue0 +; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: s_cbranch_execz BB7_4 +; GFX10-32-NEXT: ; %bb.6: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 +; 
GFX10-32-NEXT: ; %bb.7: ; %.demote1 +; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_branch BB7_4 +; GFX10-32-NEXT: BB7_8: ; %.return +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB7_9: +; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv_loop: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.2: ; %.demote0 +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: s_branch BB7_5 +; GFX10-64-NEXT: BB7_4: ; %.continue1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_add_i32 s2, s2, 1 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_8 +; GFX10-64-NEXT: BB7_5: ; %.continue0 +; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7] +; 
GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_or_b64 s[6:7], s[6:7], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX10-64-NEXT: s_cbranch_execz BB7_4 +; GFX10-64-NEXT: ; %bb.6: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 +; GFX10-64-NEXT: ; %bb.7: ; %.demote1 +; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_branch BB7_4 +; GFX10-64-NEXT: BB7_8: ; %.return +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB7_9: +; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ 
%next, %.continue1 ] + %live = call i1 @llvm.amdgcn.live.mask() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.wqm.demote(i1) #0 +declare i1 @llvm.amdgcn.live.mask() #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone }