Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,6 +37,15 @@
          [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def move_uniform_icmp_w_one_use_matchdata : GIDefMatchData<"Register">;
+
+def move_uniform_icmp_w_one_use : GICombineRule<
+  (defs root:$i1_use, move_uniform_icmp_w_one_use_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SELECT,
+                           G_BRCOND):$i1_use,
+         [{ return RegBankHelper.matchMoveUniformICmpWOneUse(*${i1_use}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applyMoveUniformICmpWOneUse(*${i1_use}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -56,6 +65,8 @@
 }
 
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
-  "AMDGPUGenRegBankCombinerHelper", []> {
+  "AMDGPUGenRegBankCombinerHelper", [move_uniform_icmp_w_one_use]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+  let StateClass = "AMDGPURegBankCombinerHelperState";
+  let AdditionalArguments = [];
 }
Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -13,6 +13,7 @@
 
 #include "AMDGPU.h"
 #include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -27,6 +28,118 @@
 
 using namespace llvm;
 using namespace MIPatternMatch;
+/// Hand-written match/apply callbacks for the rules of the AMDGPU regbank
+/// combiner; rule code in AMDGPUCombine.td calls them via RegBankHelper.
+class AMDGPURegBankCombinerHelper {
+protected:
+  MachineIRBuilder &B;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  const RegisterBankInfo &RBI;
+  const TargetRegisterInfo &TRI;
+  CombinerHelper &Helper;
+  GISelChangeObserver &Observer;
+
+public:
+  AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper,
+                              GISelChangeObserver &Observer)
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+        RBI(*MF.getSubtarget().getRegBankInfo()),
+        TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper),
+        Observer(Observer) {}
+
+  /// Return true if \p Reg is assigned to the SGPR register bank.
+  bool isSgprRegBank(Register Reg);
+
+  // select zext(trunc(icmp)) ? a : b -> select icmp ? a : b
+  // brcond zext(trunc(icmp)) %bb1, %bb2 -> brcond icmp %bb1, %bb2
+  bool matchMoveUniformICmpWOneUse(MachineInstr &MI, Register &ICmpDef);
+  void applyMoveUniformICmpWOneUse(MachineInstr &MI, Register &ICmpDef);
+
+  /// Return the condition operand of \p MI (a G_SELECT or G_BRCOND).
+  MachineOperand &getConditionOp(MachineInstr &MI);
+};
+
+bool AMDGPURegBankCombinerHelper::isSgprRegBank(Register Reg) {
+  // getRegBank() yields null while no bank has been assigned to Reg.
+  const RegisterBank *Bank = RBI.getRegBank(Reg, MRI, TRI);
+  return Bank && Bank->getID() == AMDGPU::SGPRRegBankID;
+}
+
+MachineOperand &AMDGPURegBankCombinerHelper::getConditionOp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode!");
+  case TargetOpcode::G_SELECT:
+    return MI.getOperand(1);
+  case TargetOpcode::G_BRCOND:
+    return MI.getOperand(0);
+  }
+}
+
+/// Match \p MI when its condition is zext(trunc(icmp)) and the condition,
+/// the zext source and the trunc source each have exactly one non-debug use.
+/// On success \p ICmpDef is set to the G_ICMP result register.
+bool AMDGPURegBankCombinerHelper::matchMoveUniformICmpWOneUse(
+    MachineInstr &MI, Register &ICmpDef) {
+  Register Condition = getConditionOp(MI).getReg();
+  if (!isSgprRegBank(Condition) || !MRI.hasOneNonDBGUse(Condition))
+    return false;
+
+  Register ZextSrc;
+  if (!mi_match(Condition, MRI, m_GZExt(m_Reg(ZextSrc))) ||
+      !MRI.hasOneNonDBGUse(ZextSrc))
+    return false;
+
+  Register TruncSrc;
+  if (!mi_match(ZextSrc, MRI, m_GTrunc(m_Reg(TruncSrc))) ||
+      !MRI.hasOneNonDBGUse(TruncSrc))
+    return false;
+
+  // The apply step plugs the compare result straight into the condition
+  // operand, so both registers must have the same type.
+  if (MRI.getType(TruncSrc) != MRI.getType(Condition))
+    return false;
+
+  const MachineInstr *Def = MRI.getVRegDef(TruncSrc);
+  if (!Def || Def->getOpcode() != TargetOpcode::G_ICMP)
+    return false;
+
+  ICmpDef = TruncSrc;
+  return true;
+}
+
+/// Re-emit the matched G_ICMP directly in front of \p MI and make \p MI use
+/// its result as the condition; the original G_ICMP is erased.
+void AMDGPURegBankCombinerHelper::applyMoveUniformICmpWOneUse(
+    MachineInstr &MI, Register &ICmpDef) {
+  MachineInstr &ICmp = *MRI.getVRegDef(ICmpDef);
+  const auto Pred =
+      static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
+
+  // The new compare defines the same register as the one it replaces.
+  B.setInstrAndDebugLoc(MI);
+  B.buildICmp(Pred, ICmp.getOperand(0), ICmp.getOperand(2),
+              ICmp.getOperand(3));
+
+  Observer.changingInstr(MI);
+  getConditionOp(MI).setReg(ICmpDef);
+  Observer.changedInstr(MI);
+  ICmp.eraseFromParent();
+}
+
+/// State class named by StateClass in AMDGPUCombine.td; it holds the helpers
+/// that the rule snippets reference (Helper and RegBankHelper).
+class AMDGPURegBankCombinerHelperState {
+protected:
+  CombinerHelper &Helper;
+  AMDGPURegBankCombinerHelper &RegBankHelper;
+
+public:
+  AMDGPURegBankCombinerHelperState(CombinerHelper &Helper,
+                                   AMDGPURegBankCombinerHelper &RegBankHelper)
+      : Helper(Helper), RegBankHelper(RegBankHelper) {}
+};
 
 #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenRegBankGICombiner.inc"
@@ -62,9 +175,11 @@
                                               MachineInstr &MI,
                                               MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+  AMDGPURegBankCombinerHelper RegBankHelper(B, Helper, Observer);
+  AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper,
+                                           RegBankHelper);
 
-  if (Generated.tryCombineAll(Observer, MI, B, Helper))
+  if (Generated.tryCombineAll(Observer, MI, B))
     return true;
 
   return false;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,6 +228,7 @@
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
+  initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
@@ -803,6 +804,7 @@
   bool addLegalizeMachineIR() override;
   void addPreRegBankSelect() override;
   bool addRegBankSelect() override;
+  void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
@@ -1109,6 +1111,11 @@
return false; } +void GCNPassConfig::addPreGlobalInstructionSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPURegBankCombiner(IsOptNone)); +} + bool GCNPassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); // TODO: Fix instruction selection to do the right thing for image Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-move-uniform-icmp-with-one-use.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-move-uniform-icmp-with-one-use.mir @@ -0,0 +1,226 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: uniform_icmp_select +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1 + + ; CHECK-LABEL: name: uniform_icmp_select + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY4]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:vgpr(p1) = COPY $vgpr0_vgpr1 + %7:sgpr(s32) = G_CONSTANT i32 -1 + %12:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %7 + 
%8:sgpr(s1) = G_TRUNC %12(s32) + %9:sgpr(s32) = G_CONSTANT i32 268435455 + %10:sgpr(s32) = G_AND %2, %9 + %13:sgpr(s32) = G_ZEXT %8(s1) + %11:sgpr(s32) = G_SELECT %13(s32), %1, %10 + %14:vgpr(s32) = COPY %11(s32) + G_STORE %14(s32), %3(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_select_not_single_use +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: uniform_icmp_select_not_single_use + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY6]](s32), [[COPY4]](p1) :: (store 4, addrspace 1) + ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT1]](s32), [[AND]], [[COPY3]] + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT1]](s32) + ; CHECK: G_STORE [[COPY7]](s32), [[COPY5]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = COPY $sgpr3 + %4:vgpr(p1) 
= COPY $vgpr0_vgpr1 + %5:vgpr(p1) = COPY $vgpr2_vgpr3 + %11:sgpr(s32) = G_CONSTANT i32 -1 + %17:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %11 + %12:sgpr(s1) = G_TRUNC %17(s32) + %13:sgpr(s32) = G_CONSTANT i32 268435455 + %14:sgpr(s32) = G_AND %2, %13 + %18:sgpr(s32) = G_ZEXT %12(s1) + %15:sgpr(s32) = G_SELECT %18(s32), %1, %14 + %19:vgpr(s32) = COPY %15(s32) + G_STORE %19(s32), %4(p1) :: (store 4, addrspace 1) + %20:sgpr(s32) = G_ZEXT %12(s1) + %16:sgpr(s32) = G_SELECT %20(s32), %14, %3 + %21:vgpr(s32) = COPY %16(s32) + G_STORE %21(s32), %5(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_brcond +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_icmp_brcond + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sle), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store 4, addrspace 1) + ; CHECK: bb.2: + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY5]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:vgpr(p1) = COPY 
$vgpr0_vgpr1 + %3:vgpr(p1) = COPY $vgpr2_vgpr3 + %9:sgpr(s32) = G_CONSTANT i32 -1 + %15:sgpr(s32) = G_ICMP intpred(sle), %0(s32), %9 + %10:sgpr(s1) = G_TRUNC %15(s32) + %11:sgpr(s32) = G_CONSTANT i32 268435455 + %12:sgpr(s32) = G_AND %1, %11 + %16:sgpr(s32) = G_ZEXT %10(s1) + G_BRCOND %16(s32), %bb.1 + G_BR %bb.2 + + bb.1: + %17:vgpr(s32) = COPY %12(s32) + G_STORE %17(s32), %2(p1) :: (store 4, addrspace 1) + + bb.2: + %18:vgpr(s32) = COPY %12(s32) + G_STORE %18(s32), %3(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_brcond_not_single_use +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_icmp_brcond_not_single_use + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C2]](s1) + ; CHECK: [[XOR:%[0-9]+]]:sgpr(s32) = G_XOR [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[XOR]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1) + ; CHECK: G_BRCOND [[ZEXT]](s32), %bb.2 + ; CHECK: G_BR %bb.1 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT 
[[TRUNC]](s1) + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT1]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY5]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: bb.2: + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY6]](s32), [[COPY4]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:vgpr(p1) = COPY $vgpr0_vgpr1 + %4:vgpr(p1) = COPY $vgpr2_vgpr3 + %10:sgpr(s32) = G_CONSTANT i32 -1 + %17:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %10 + %11:sgpr(s1) = G_TRUNC %17(s32) + %12:sgpr(s32) = G_CONSTANT i32 268435455 + %13:sgpr(s32) = G_AND %2, %12 + %15:sgpr(s1) = G_CONSTANT i1 true + %18:sgpr(s32) = G_ANYEXT %11(s1) + %19:sgpr(s32) = G_ANYEXT %15(s1) + %20:sgpr(s32) = G_XOR %18, %19 + %16:sgpr(s1) = G_TRUNC %20(s32) + %21:sgpr(s32) = G_ZEXT %16(s1) + G_BRCOND %21(s32), %bb.2 + G_BR %bb.1 + + bb.1: + %22:sgpr(s32) = G_ZEXT %11(s1) + %14:sgpr(s32) = G_SELECT %22(s32), %1, %13 + %23:vgpr(s32) = COPY %14(s32) + G_STORE %23(s32), %3(p1) :: (store 4, addrspace 1) + + bb.2: + %24:vgpr(s32) = COPY %13(s32) + G_STORE %24(s32), %4(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -139,9 +139,6 @@ ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc1 BB4_4 ; CHECK-NEXT: ; %bb.1: ; %bb2 ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -161,9 +158,6 @@ ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: BB4_3: ; %bb8 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_5 ; CHECK-NEXT: BB4_4: ; %bb12 ; CHECK-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -9,9 +9,6 @@ ; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cselect_b32 s1, 1, 0 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -8,9 +8,6 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 1, 0 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 
s2, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -43,39 +43,36 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 56 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_cbranch_scc0 BB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_branch BB2_3 ; GCN-NEXT: BB2_2: -; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s4, -1 ; GCN-NEXT: BB2_3: ; %Flow -; GCN-NEXT: s_xor_b32 s0, s0, -1 -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_xor_b32 s2, s4, -1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 BB2_5 ; GCN-NEXT: ; %bb.4: ; %.zero -; GCN-NEXT: 
s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: BB2_5: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/move-uniform-icmp-with-one-use.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/move-uniform-icmp-with-one-use.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s + +define amdgpu_cs void @uniform_icmp_select(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out_ptr) { +; GFX10-LABEL: uniform_icmp_select: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, s2, 0xfffffff +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + %result = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_cs void @uniform_icmp_select_not_single_use(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 inreg %src3, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_select_not_single_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, 0xfffffff +; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cselect_b32 s0, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; 
GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: global_store_dword v[2:3], v5, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + %result1 = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result1, i32 addrspace(1)* %out_ptr1 + %result2 = select i1 %cmp, i32 %maskedsrc2, i32 %src3 + store i32 %result2, i32 addrspace(1)* %out_ptr2 + ret void +} + +define amdgpu_cs void @uniform_icmp_brcond(i32 inreg %cmp_val, i32 inreg %src, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_brcond: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xfffffff +; GFX10-NEXT: s_cmp_le_i32 s0, -1 +; GFX10-NEXT: s_cbranch_scc1 BB2_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: BB2_2: ; %bb1 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc = and i32 %src, 268435455 + br i1 %cmp, label %bb0, label %bb1 +bb0: + store i32 %maskedsrc, i32 addrspace(1)* %out_ptr1 + br label %bb1 +bb1: + store i32 %maskedsrc, i32 addrspace(1)* %out_ptr2 + ret void +} + +define amdgpu_cs void @uniform_icmp_brcond_not_single_use(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_brcond_not_single_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_xor_b32 s0, s3, -1 +; GFX10-NEXT: s_and_b32 s4, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 0xfffffff +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cbranch_scc1 BB3_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: s_and_b32 s2, s3, 1 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s1, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: BB3_2: 
; %bb1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + br i1 %cmp, label %bb0, label %bb1 +bb0: + %result1 = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result1, i32 addrspace(1)* %out_ptr1 + br label %bb1 +bb1: + store i32 %maskedsrc2, i32 addrspace(1)* %out_ptr2 + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -21,18 +21,12 @@ ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -102,9 +96,6 @@ ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5293,32 +5293,29 @@ ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 
s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_and_b32 s13, s13, 1 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5387,31 +5384,28 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; 
GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_and_b32 s13, s13, 1 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5480,31 +5474,28 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; 
GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_and_b32 s13, s13, 1 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5570,57 +5561,54 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: s_and_b32 s13, 1, s1 ; GFX10-NEXT: s_sub_i32 s2, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s13 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 +; GFX10-NEXT: s_and_b32 s15, s15, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 ; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 ; GFX10-NEXT: s_or_b64 
s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: 
v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6586,21 +6574,19 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6613,8 +6599,7 @@ ; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6670,14 +6655,12 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; 
GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6689,8 +6672,7 @@ ; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6775,8 +6757,6 @@ ; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6787,8 +6767,7 @@ ; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6858,8 +6837,6 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6870,8 +6847,7 @@ ; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6955,8 +6931,6 @@ ; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: s_cmp_lt_u32 s20, 
64 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6967,8 +6941,7 @@ ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7038,8 +7011,6 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7050,8 +7021,7 @@ ; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7107,6 +7077,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_addc_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7116,144 +7087,137 @@ ; GFX10-NEXT: s_addc_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 
-; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 +; GFX10-NEXT: s_and_b32 s23, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 ; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 
s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; 
GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s12, 1, s2 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 ; GFX10-NEXT: s_and_b32 s13, s10, 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; 
GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5279,32 +5279,29 @@ ; GFX6-NEXT: 
v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_and_b32 s13, s13, 1 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5373,31 +5370,28 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, 
s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_and_b32 s13, s13, 1 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5466,31 +5460,28 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; 
GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_and_b32 s13, s13, 1 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5556,57 +5547,54 @@ ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: s_and_b32 s13, 1, s1 ; GFX10-NEXT: s_sub_i32 s2, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s13 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 +; GFX10-NEXT: s_and_b32 s15, s15, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], 
s2 ; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, 
vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6572,21 +6560,19 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6599,8 +6585,7 @@ ; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6656,14 +6641,12 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 
s20, 64 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6675,8 +6658,7 @@ ; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6761,8 +6743,6 @@ ; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6773,8 +6753,7 @@ ; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6844,8 +6823,6 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6856,8 +6833,7 @@ ; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: 
s_cmp_lg_u32 s12, 0 @@ -6941,8 +6917,6 @@ ; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6953,8 +6927,7 @@ ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7024,8 +6997,6 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7036,8 +7007,7 @@ ; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7093,6 +7063,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7102,144 +7073,137 @@ ; GFX10-NEXT: s_subb_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; 
GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 +; GFX10-NEXT: s_and_b32 s23, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 ; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, 
v0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_subb_u32 s8, s6, s14 ; GFX10-NEXT: 
s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s12, 1, s2 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 ; GFX10-NEXT: s_and_b32 s13, s10, 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; 
GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5