diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1258,6 +1258,10 @@
 LLVM_READONLY
 int getMFMAEarlyClobberOp(uint16_t Opcode);
 
+/// \returns the v_cmpx version of a v_cmp instruction, or -1 if none exists.
+LLVM_READONLY
+int getVCMPXOpFromVCMP(uint16_t Opcode);
+
 const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
 const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
 const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2596,6 +2596,15 @@
   let ValueCols = [["0"]];
 }
 
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+  let FilterClass = "VCMPVCMPXTable";
+  let RowFields = ["VCMPOp"];
+  let ColFields = ["IsVCMPX"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -292,6 +292,113 @@
   return false;
 }
 
+// Tries to find the v_cmp instruction that defines the input of a
+// s_and_saveexec instruction, so the pair can later be rewritten as
+// s_mov + v_cmpx. Returns nullptr if no suitable v_cmp is found.
+static MachineInstr *
+findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
+                                  const SIRegisterInfo *TRI,
+                                  MachineRegisterInfo &MRI) {
+  // Check if the dest of the saveexec instruction is an SGPR.
+  const MachineOperand &SaveExecDest = SaveExec.getOperand(0);
+  if (!SaveExecDest.isReg() || !TRI->isSGPRReg(MRI, SaveExecDest.getReg()))
+    return nullptr;
+
+  // The input of the saveexec is the SGPR holding the v_cmp result.
+  const MachineOperand &SaveExecSrc = SaveExec.getOperand(1);
+  if (!SaveExecSrc.isReg())
+    return nullptr;
+  Register Reg = SaveExecSrc.getReg();
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand.
+  for (MachineInstr &I : MRI.def_instructions(Reg)) {
+    if (AMDGPU::getVCMPXOpFromVCMP(I.getOpcode()) == -1)
+      continue;
+
+    if (!I.getOperand(0).isReg())
+      continue;
+
+    // If the v_cmp def is only used in the saveexec, the sequence can be
+    // optimized.
+    Register VCmpDest = I.getOperand(0).getReg();
+    if (MRI.hasOneUse(VCmpDest) && VCmpDest == Reg)
+      return &I;
+  }
+
+  return nullptr;
+}
+
+// Rewrites a v_cmp, s_and_saveexec pair into s_mov, v_cmpx. The v_cmpx
+// writes the compare result directly into EXEC, so no explicit dst is
+// needed.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  if (!Src0 || !Src1)
+    return false;
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator CmpIt = VCmp.getIterator();
+
+  // If there are defs of EXEC between the v_cmp and the saveexec instr,
+  // insert the new sequence right behind the last such def.
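+  // Scan backwards from the saveexec: stop at the first instruction that
+  // clobbers EXEC (e.g. the exec manipulation of a WQM/WWM region sitting
+  // between the compare and the saveexec), or at the v_cmp itself if EXEC
+  // is untouched in between. Note that the compare's source operands are
+  // assumed not to be redefined in that range.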
+  MachineBasicBlock::reverse_iterator A = SaveExecInstr.getReverseIterator(),
+                                      E = SaveExecInstr.getParent()->rend();
+
+  for (++A; A != E; ++A) {
+    if (A->modifiesRegister(Exec)) {
+      CmpIt = A->getIterator();
+      break;
+    }
+
+    if (&*A == &VCmp)
+      break;
+  }
+
+  // Pick the mov opcode matching the wave size (Exec is exec_lo in wave32
+  // mode and the full exec pair in wave64 mode).
+  const unsigned MovOpcode =
+      Exec == AMDGPU::EXEC_LO ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+
+  if (!SaveExecInstr.uses().empty())
+    BuildMI(*SaveExecInstr.getParent(), CmpIt, SaveExecInstr.getDebugLoc(),
+            TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+
+  // Omit the dst as V_CMPX implicitly writes to EXEC.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(CmpIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  // Carry the source modifiers and the clamp bit over from the original
+  // v_cmp where the v_cmpx encoding has them (the _e64 forms do).
+  auto AddNamedImmOperand = [&](unsigned OperandName) {
+    if (AMDGPU::getNamedOperandIdx(NewOpcode, OperandName) == -1)
+      return;
+    const MachineOperand *Op = TII->getNamedOperand(VCmp, OperandName);
+    Builder.addImm(Op ? Op->getImm() : 0);
+  };
+
+  AddNamedImmOperand(AMDGPU::OpName::src0_modifiers);
+  Builder.add(*Src0);
+  AddNamedImmOperand(AMDGPU::OpName::src1_modifiers);
+  Builder.add(*Src1);
+  AddNamedImmOperand(AMDGPU::OpName::clamp);
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -299,6 +406,7 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -394,6 +502,7 @@
       if (ReadsCopyFromExec) {
         SaveExecInst = &*J;
         LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+        continue;
       } else {
         LLVM_DEBUG(dbgs()
@@ -458,6 +567,42 @@
     }
   }
 
-  return true;
+  // After all s_op_saveexec instructions are inserted, replace (on GFX10.3)
+  //    v_cmp_*            SGPR, IMM, VGPR
+  //    s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  //    s_mov_b32          EXEC_SGPR_DEST, exec_lo
+  //    v_cmpx_*           IMM, VGPR
+  // to reduce pipeline stalls.
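+  // For instance (wave32; taken from cgp-addressing-modes-gfx1030.ll):
+  //    v_cmp_ne_u32_e32   vcc_lo, 0, v0
+  //    s_and_saveexec_b32 s4, vcc_lo
+  // becomes
+  //    s_mov_b32          s4, exec_lo
+  //    v_cmpx_ne_u32_e64  0, v0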
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
+                      : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Record candidate s_and_saveexec instructions, iff they read the
+        // SGPR dest of a v_cmp instruction.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, TRI, *MRI))
+          SaveExecVCmpMapping[&MI] = VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec,
+                                       TII)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+      }
+    }
+  }
+
+  return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -734,7 +734,12 @@
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
+        // Exclude VOPCX instructions as these don't explicitly write a dst.
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (!Op0.isReg() || !Op0.isDef())
+          continue;
+
+        Register DstReg = Op0.getReg();
         if (DstReg.isVirtual()) {
           // VOPC instructions can only write to the VCC register. We can't
           // force them to use VCC here, because this is only one register and
@@ -744,7 +749,7 @@
           // So, instead of forcing the instruction to write to VCC, we provide
           // a hint to the register allocator to use VCC and then we will run
           // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+          MRI.setRegAllocationHint(Op0.getReg(), 0, VCCReg);
           continue;
         }
         if (DstReg != VCCReg)
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,6 +205,11 @@
   string NoSDstOp = Name;
 }
 
+class VCMPVCMPXTable <bit is_vcmpx, string Name> {
+  bit IsVCMPX = is_vcmpx;
+  string VCMPOp = Name;
+}
+
 multiclass VOPC_Pseudos <string opName,
                          VOPC_Profile P,
                          SDPatternOperator cond = COND_NULL,
                          string revOp = opName,
                          bit DefExec = 0> {
@@ -212,7 +217,8 @@
   def _e32 : VOPC_Pseudo <opName, P>,
              Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e32"> {
+             VCMPXNoSDstTable<1, opName#"_e32">,
+             VCMPVCMPXTable<0, opName#"_e32"> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -223,7 +229,8 @@
   def _e64 : VOP3_Pseudo<opName#"_e64", P, getVOPCPat64<cond, P>.ret>,
             Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e64"> {
+             VCMPXNoSDstTable<1, opName#"_e64">,
+             VCMPVCMPXTable<0, opName#"_e64"> {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -248,7 +255,8 @@
   def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
                     Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
-                    VCMPXNoSDstTable<0, opName#"_e32"> {
+                    VCMPXNoSDstTable<0, opName#"_e32">,
+                    VCMPVCMPXTable<1, opName#"_e32"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
@@ -259,7 +267,8 @@
   def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst_e64", P_NoSDst>,
                     Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
-                    VCMPXNoSDstTable<0, opName#"_e64"> {
+                    VCMPXNoSDstTable<0, opName#"_e64">,
+                    VCMPVCMPXTable<1, opName#"_e64"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
@@ -915,7 +924,7 @@
   def _e64_gfx10 :
     VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
     VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
-    let Inst{7-0} = ?; // sdst
+    let Inst{7-0} = 0x7e; // sdst
     let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
                     # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -9,7 +9,7 @@
 ; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
 ; OPT-NEXT: entry:
 ; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT: if:
@@ -27,9 +27,9 @@
 ; GCN: ; %bb.0: ; %entry
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GCN-NEXT: s_mov_b32 s4, exec_lo
+; GCN-NEXT: v_cmpx_ne_u32_e64 0, v0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
 ; GCN-NEXT: s_cbranch_execz .LBB0_2
 ; GCN-NEXT: ; %bb.1: ; %if
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
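+
+; The tests below check that on GFX10.3 a v_cmp of an immediate against a
+; VGPR, followed by a s_and_saveexec of its result, is folded into an exec
+; copy plus v_cmpx, while GFX10.1 keeps the original two-instruction
+; sequence. The compare conditions are inverted relative to the IR (with
+; immediates adjusted for the strict forms) because the branch guards the
+; %if block.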
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+  %bc = icmp slt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+  %bc = icmp sgt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+  %bc = icmp eq i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+  %bc = icmp ne i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+  %bc = icmp sle i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+  %bc = icmp sge i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -380,10 +380,10 @@
 ; GFX10-W32-LABEL: test_wwm3:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB9_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
@@ -444,10 +444,10 @@
 ; GFX10-W32-LABEL: test_wwm4:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB10_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
@@ -570,10 +570,10 @@
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB12_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
@@ -840,10 +840,10 @@
 ; GFX10-W32-LABEL: test_strict_wqm3:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB17_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
@@ -906,10 +906,10 @@
 ; GFX10-W32-LABEL: test_strict_wqm4:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB18_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
@@ -1039,10 +1039,10 @@
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB20_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
@@ -1248,8 +1248,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1327,8 +1327,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1413,13 +1413,13 @@
 ; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e64 0, v0
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
@@ -1505,12 +1505,12 @@
 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e64 0, v1
 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-W32-NEXT: ; implicit-def: $vgpr0
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
@@ -1575,8 +1575,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2335,9 +2335,9 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB36_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
@@ -2488,10 +2488,10 @@
 ; GFX10-W32-LABEL: test_strict_wwm3:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB39_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
@@ -2552,10 +2552,10 @@
 ; GFX10-W32-LABEL: test_strict_wwm4:
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
@@ -2678,10 +2678,10 @@
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e64 16, v0
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB42_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %if
 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
@@ -2868,9 +2868,9 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB45_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
@@ -2954,9 +2954,9 @@
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo