diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -893,7 +893,7 @@
     return false;
 
   // V_NOP will be discarded by SQ.
-  // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
   // which is always a VGPR and available.
   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
   Register Reg = Src0->getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1258,6 +1258,10 @@
   LLVM_READONLY
   int getMFMAEarlyClobberOp(uint16_t Opcode);
 
+  /// \returns v_cmpx version of a v_cmp instruction.
+  LLVM_READONLY
+  int getVCMPXOpFromVCMP(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2596,6 +2596,15 @@
   let ValueCols = [["0"]];
 }
 
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+  let FilterClass = "VCMPVCMPXTable";
+  let RowFields = ["VCMPOp"];
+  let ColFields = ["IsVCMPX"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -292,6 +292,148 @@
   return false;
 }
 
+// Backwards-iterate from Origin (for n iterations) until either the beginning
+// of the BB is reached or Pred evaluates to true - which can be an arbitrary
+// condition based on the current MachineInstr, for instance a target
+// instruction. Breaks prematurely by returning nullptr if DisallowDefBetween
+// is true and one of the registers given in NonModifiableRegs is modified by
+// the current instruction.
+static MachineInstr *findInstrBackwards(
+    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+    SmallVector<MCRegister, 2> &NonModifiableRegs,
+    bool DisallowDefBetween = true, unsigned MaxInstructions = 5) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    bool PredResult = Pred(&*A);
+
+    if (!PredResult) {
+      if (DisallowDefBetween)
+        for (MCRegister Reg : NonModifiableRegs)
+          if (A->modifiesRegister(Reg))
+            return nullptr;
+
+      ++CurrentIteration;
+      continue;
+    }
+
+    return &*A;
+  }
+
+  return nullptr;
+}
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of an s_and_saveexec instruction. Returns a
+// pointer to the v_cmp instruction if there is no write to any of the v_cmp
+// input operands and no write to exec in between.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (SaveExec.getOperand(0).getSubReg() || !TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg() || SaveExecSrc0->getSubReg())
+    return nullptr;
+
+  SmallVector<MCRegister, 2> NonDefRegs;
+  NonDefRegs.push_back(SaveExecSrc0->getReg());
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1;
+      },
+      NonDefRegs);
+
+  if (!VCmp)
+    return nullptr;
+
+  // Try to determine if there is either a write to Exec or to one of the VCmp
+  // operands between the saveexec and the vcmp.
+  // In the first case, the transformation does not make sense.
+  // In the second case, additional VGPR spilling might need to be inserted,
+  // which might not be worth it.
+  // In either case, don't replace the instruction sequence.
+  NonDefRegs.clear();
+  NonDefRegs.push_back(Exec);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+
+  if (!Src0 || !Src1)
+    return nullptr;
+
+  if (Src0->isReg() && !Src0->getSubReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg() && !Src1->getSubReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 s*, exec_lo / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+  if (SaveExecInstr.getOperand(0).getSubReg())
+    return false;
+
+  MachineBasicBlock::instr_iterator InsertPosIt = VCmp.getIterator();
+  if (!SaveExecInstr.uses().empty())
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), MoveDest)
+        .addReg(Exec);
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add src modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src0);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src1);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+    Builder.addImm(0);
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -299,6 +441,7 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -458,6 +601,42 @@
     }
   }
 
-  return true;
+  // After all s_op_saveexec instructions are inserted,
+  // replace (on GFX10.3 and later)
+  // v_cmp_* SGPR, IMM, VGPR
+  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+  // v_cmpx_* IMM, VGPR
+  // to reduce pipeline stalls.
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Try to record existing s_and_saveexec instructions, iff
+        // they are reading from a v_cmp dest SGPR write.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+          SaveExecVCmpMapping[&MI] = &*VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, *MRI)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+      }
+    }
+  }
+
+  return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -734,7 +734,13 @@
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
+        // Exclude VOPCX instructions as these don't explicitly write a
+        // dst.
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (!Op0.isReg() || !Op0.isDef())
+          continue;
+
+        Register DstReg = Op0.getReg();
         if (DstReg.isVirtual()) {
           // VOPC instructions can only write to the VCC register. We can't
           // force them to use VCC here, because this is only one register and
@@ -744,7 +750,7 @@
           // So, instead of forcing the instruction to write to VCC, we provide
           // a hint to the register allocator to use VCC and then we will run
           // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+          MRI.setRegAllocationHint(Op0.getReg(), 0, VCCReg);
           continue;
         }
         if (DstReg != VCCReg)
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,6 +205,11 @@
   string NoSDstOp = Name;
 }
 
+class VCMPVCMPXTable <string Name> {
+  bit IsVCMPX = 0;
+  string VCMPOp = Name;
+}
+
 multiclass VOPC_Pseudos <string opName,
                          VOPC_Profile P,
                          SDPatternOperator cond = COND_NULL,
                          string revOp = opName,
                          bit DefExec = 0> {
 
   def _e32 : VOPC_Pseudo <opName, P>,
              Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e32"> {
+             VCMPXNoSDstTable<1, opName#"_e32">,
+             VCMPVCMPXTable<opName#"_e32"> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -223,7 +229,8 @@
 
   def _e64 : VOP3_Pseudo<opName#"_e64", P, getVOPCPat64<cond, P>.ret>,
              Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e64"> {
+             VCMPXNoSDstTable<1, opName#"_e64">,
+             VCMPVCMPXTable<opName#"_e64"> {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -248,23 +255,29 @@
   def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, cond, 0>,
              Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<0, opName#"_e32"> {
+             VCMPXNoSDstTable<0, opName#"_e32">,
+             VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName)#"_e32"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    // If the result of the substitution is not equal to the original
+    // opName, this is likely to be a v_cmpx instruction.
+    let IsVCMPX = !ne(!subst("v_cmpx", "v_cmp", opName), opName);
   }
 
   def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst_e64", P_NoSDst>,
              Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<0, opName#"_e64"> {
+             VCMPXNoSDstTable<0, opName#"_e64">,
+             VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName)#"_e64"> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = !ne(!subst("v_cmpx", "v_cmp", opName), opName);
   }
 
   foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
@@ -915,7 +928,7 @@
     def _e64_gfx10 :
       VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
       VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
-      let Inst{7-0} = ?; // sdst
+      let Inst{7-0} = 0x7e; // sdst
       let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
                     # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
     }
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
 
@@ -6,7 +7,7 @@
 ; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
 ; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: s_cmp_lg_u32
 ; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +52,9 @@
 }
 
 ; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: v_cmp_eq_u32
 ; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+  %bc = icmp slt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+  %bc = icmp sgt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+  %bc = icmp eq i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+  %bc = icmp ne i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+  %bc = icmp sle i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+  %bc = icmp sge i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1248,8 +1248,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1327,8 +1327,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1505,12 +1505,12 @@
 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e64 0, v1
 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-W32-NEXT: ; implicit-def: $vgpr0
 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
@@ -1575,8 +1575,8 @@
 ; GFX10-W32: ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2954,9 +2954,9 @@
 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e64 0, v1
 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo