Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -37,6 +37,25 @@ [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]), (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; +def move_uniform_icmp_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::MoveUniformICmpMatchInfo">; + +def move_uniform_icmp : GICombineRule< + (defs root:$i1_use, move_uniform_icmp_matchdata:$matchinfo), + (match (wip_match_opcode G_SELECT, + G_BRCOND):$i1_use, + [{ return RegBankHelper.matchMoveUniformICmp(*${i1_use}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMoveUniformICmp(*${i1_use}, ${matchinfo}); }])>; + + +def uniform_icmp_matchdata : GIDefMatchData<"Register">; + +def uniform_icmp : GICombineRule< + (defs root:$i1_use, uniform_icmp_matchdata:$matchinfo), + (match (wip_match_opcode G_SELECT, + G_BRCOND):$i1_use, + [{ return RegBankHelper.matchUniformICmp(*${i1_use}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyUniformICmp(*${i1_use}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -56,6 +75,8 @@ } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", []> { + "AMDGPUGenRegBankCombinerHelper", [move_uniform_icmp, uniform_icmp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; + let StateClass = "AMDGPURegBankCombinerHelperState"; + let AdditionalArguments = []; } Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -13,6 +13,7 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -27,6 +28,142 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPURegBankCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + const RegisterBankInfo &RBI; + const TargetRegisterInfo &TRI; + CombinerHelper &Helper; + GISelChangeObserver &Observer; + +public: + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper, + GISelChangeObserver &Observer) + : B(B), MF(B.getMF()), MRI(*B.getMRI()), + RBI(*MF.getSubtarget().getRegBankInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper), + Observer(Observer){}; + + bool isSgprRegBank(Register Reg); + + struct MoveUniformICmpMatchInfo { + Register ICmpDef; + Register TruncDef; + }; + + // Move uniform icmp (and trunc if there are multiple uses of trunc) before + // select/brcond. + // select zext(trunc(icmp)) ? a : b -> select icmp ? a : b + // brcond zext(trunc(icmp)) %bb1, %bb2 -> brcond icmp %bb1, %bb2 + bool matchMoveUniformICmp(MachineInstr &MI, + MoveUniformICmpMatchInfo &MatchInfo); + void applyMoveUniformICmp(MachineInstr &MI, + MoveUniformICmpMatchInfo &MatchInfo); + MachineOperand &getConditionOp(MachineInstr &MI); + + // Replaces uniform zext(trunc(icmp)) with icmp. + bool matchUniformICmp(MachineInstr &MI, Register &ICmp); + void applyUniformICmp(MachineInstr &MI, Register &ICmp); +}; + +bool AMDGPURegBankCombinerHelper::isSgprRegBank(Register Reg) { + return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; +} + +MachineOperand &AMDGPURegBankCombinerHelper::getConditionOp(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case TargetOpcode::G_SELECT: + return MI.getOperand(1); + case TargetOpcode::G_BRCOND: + return MI.getOperand(0); + } +} + +bool AMDGPURegBankCombinerHelper::matchMoveUniformICmp( + MachineInstr &MI, MoveUniformICmpMatchInfo &MatchInfo) { + Register Condition = getConditionOp(MI).getReg(); + if (!isSgprRegBank(Condition) || !MRI.hasOneNonDBGUse(Condition)) + return false; + + Register ZextSrc; + if (!mi_match(Condition, MRI, m_GZExt(m_Reg(ZextSrc)))) + return false; + + Register TruncSrc; + if (!mi_match(ZextSrc, MRI, m_GTrunc(m_Reg(TruncSrc))) || + !MRI.hasOneNonDBGUse(TruncSrc) || + MRI.getVRegDef(TruncSrc)->getOpcode() != TargetOpcode::G_ICMP) + return false; + + MatchInfo.TruncDef = 0; + // Multiple uses of trunc, we also have to move trunc. + if (!MRI.hasOneNonDBGUse(ZextSrc)) { + MachineInstr *Zext = MRI.getVRegDef(Condition); + for (auto &TruncUse : MRI.use_nodbg_instructions(ZextSrc)) { + if (!Helper.dominates(*Zext, TruncUse)) + return false; + } + MatchInfo.TruncDef = ZextSrc; + } + + MatchInfo.ICmpDef = TruncSrc; + return true; +} + +void AMDGPURegBankCombinerHelper::applyMoveUniformICmp( + MachineInstr &MI, MoveUniformICmpMatchInfo &MatchInfo) { + MachineInstr &ICmp = *MRI.getVRegDef(MatchInfo.ICmpDef); + B.setInstrAndDebugLoc(MI); + B.buildICmp((CmpInst::Predicate)ICmp.getOperand(1).getPredicate(), + ICmp.getOperand(0), ICmp.getOperand(2), ICmp.getOperand(3)); + if (MatchInfo.TruncDef) { + MRI.getVRegDef(MatchInfo.TruncDef)->eraseFromParent(); + B.buildTrunc(MatchInfo.TruncDef, MatchInfo.ICmpDef); + } + ICmp.eraseFromParent(); + Observer.changingInstr(MI); + getConditionOp(MI).setReg(MatchInfo.ICmpDef); + Observer.changedInstr(MI); +} + +bool AMDGPURegBankCombinerHelper::matchUniformICmp(MachineInstr &MI, + Register &ICmp) { + Register Condition = getConditionOp(MI).getReg(); + if (!isSgprRegBank(Condition)) + return false; + + Register I1_Src; + if (!mi_match(Condition, MRI, m_GZExt(m_GTrunc(m_Reg(I1_Src))))) + return false; + + if (MRI.getVRegDef(I1_Src)->getOpcode() != TargetOpcode::G_ICMP) + return false; + + ICmp = I1_Src; + return true; +} + +void AMDGPURegBankCombinerHelper::applyUniformICmp(MachineInstr &MI, + Register &ICmp) { + Observer.changingInstr(MI); + getConditionOp(MI).setReg(ICmp); + Observer.changedInstr(MI); +} + +class AMDGPURegBankCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPURegBankCombinerHelper &RegBankHelper; + +public: + AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, + AMDGPURegBankCombinerHelper &RegBankHelper) + : Helper(Helper), RegBankHelper(RegBankHelper) {} +}; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" @@ -62,9 +199,11 @@ MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper, Observer); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, + RegBankHelper); - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -228,6 +228,7 @@ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); + initializeAMDGPURegBankCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); @@ -803,6 +804,7 @@ bool addLegalizeMachineIR() override; void addPreRegBankSelect() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1109,6 +1111,11 @@ return false; } +void GCNPassConfig::addPreGlobalInstructionSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPURegBankCombiner(IsOptNone)); +} + bool GCNPassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); // TODO: Fix instruction selection to do the right thing for image Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-move-uniform-icmp.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-move-uniform-icmp.mir @@ -0,0 +1,223 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: uniform_icmp_select +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1 + + ; CHECK-LABEL: name: uniform_icmp_select + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY4]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:vgpr(p1) = COPY $vgpr0_vgpr1 + %7:sgpr(s32) = G_CONSTANT i32 -1 + %12:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %7 + %8:sgpr(s1) = G_TRUNC %12(s32) + %9:sgpr(s32) = G_CONSTANT i32 268435455 + %10:sgpr(s32) = G_AND %2, %9 + %13:sgpr(s32) = G_ZEXT %8(s1) + %11:sgpr(s32) = G_SELECT %13(s32), %1, %10 + %14:vgpr(s32) = COPY %11(s32) + G_STORE %14(s32), %3(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_select_not_single_use +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: uniform_icmp_select_not_single_use + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY6]](s32), [[COPY4]](p1) :: (store 4, addrspace 1) + ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[AND]], [[COPY3]] + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT1]](s32) + ; CHECK: G_STORE [[COPY7]](s32), [[COPY5]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = COPY $sgpr3 + %4:vgpr(p1) = COPY $vgpr0_vgpr1 + %5:vgpr(p1) = COPY $vgpr2_vgpr3 + %11:sgpr(s32) = G_CONSTANT i32 -1 + %17:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %11 + %12:sgpr(s1) = G_TRUNC %17(s32) + %13:sgpr(s32) = G_CONSTANT i32 268435455 + %14:sgpr(s32) = G_AND %2, %13 + %18:sgpr(s32) = G_ZEXT %12(s1) + %15:sgpr(s32) = G_SELECT %18(s32), %1, %14 + %19:vgpr(s32) = COPY %15(s32) + G_STORE %19(s32), %4(p1) :: (store 4, addrspace 1) + %20:sgpr(s32) = G_ZEXT %12(s1) + %16:sgpr(s32) = G_SELECT %20(s32), %14, %3 + %21:vgpr(s32) = COPY %16(s32) + G_STORE %21(s32), %5(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_brcond +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_icmp_brcond + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sle), [[COPY]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store 4, addrspace 1) + ; CHECK: bb.2: + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY5]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:vgpr(p1) = COPY $vgpr0_vgpr1 + %3:vgpr(p1) = COPY $vgpr2_vgpr3 + %9:sgpr(s32) = G_CONSTANT i32 -1 + %15:sgpr(s32) = G_ICMP intpred(sle), %0(s32), %9 + %10:sgpr(s1) = G_TRUNC %15(s32) + %11:sgpr(s32) = G_CONSTANT i32 268435455 + %12:sgpr(s32) = G_AND %1, %11 + %16:sgpr(s32) = G_ZEXT %10(s1) + G_BRCOND %16(s32), %bb.1 + G_BR %bb.2 + + bb.1: + %17:vgpr(s32) = COPY %12(s32) + G_STORE %17(s32), %2(p1) :: (store 4, addrspace 1) + + bb.2: + %18:vgpr(s32) = COPY %12(s32) + G_STORE %18(s32), %3(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... + +--- +name: uniform_icmp_brcond_not_single_use +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_icmp_brcond_not_single_use + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C2]](s1) + ; CHECK: [[XOR:%[0-9]+]]:sgpr(s32) = G_XOR [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[XOR]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1) + ; CHECK: G_BRCOND [[ZEXT]](s32), %bb.2 + ; CHECK: G_BR %bb.1 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY1]], [[AND]] + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY5]](s32), [[COPY3]](p1) :: (store 4, addrspace 1) + ; CHECK: bb.2: + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[AND]](s32) + ; CHECK: G_STORE [[COPY6]](s32), [[COPY4]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:vgpr(p1) = COPY $vgpr0_vgpr1 + %4:vgpr(p1) = COPY $vgpr2_vgpr3 + %10:sgpr(s32) = G_CONSTANT i32 -1 + %17:sgpr(s32) = G_ICMP intpred(sgt), %0(s32), %10 + %11:sgpr(s1) = G_TRUNC %17(s32) + %12:sgpr(s32) = G_CONSTANT i32 268435455 + %13:sgpr(s32) = G_AND %2, %12 + %15:sgpr(s1) = G_CONSTANT i1 true + %18:sgpr(s32) = G_ANYEXT %11(s1) + %19:sgpr(s32) = G_ANYEXT %15(s1) + %20:sgpr(s32) = G_XOR %18, %19 + %16:sgpr(s1) = G_TRUNC %20(s32) + %21:sgpr(s32) = G_ZEXT %16(s1) + G_BRCOND %21(s32), %bb.2 + G_BR %bb.1 + + bb.1: + %22:sgpr(s32) = G_ZEXT %11(s1) + %14:sgpr(s32) = G_SELECT %22(s32), %1, %13 + %23:vgpr(s32) = COPY %14(s32) + G_STORE %23(s32), %3(p1) :: (store 4, addrspace 1) + + bb.2: + %24:vgpr(s32) = COPY %13(s32) + G_STORE %24(s32), %4(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -139,9 +139,6 @@ ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc1 BB4_4 ; CHECK-NEXT: ; %bb.1: ; %bb2 ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -161,9 +158,6 @@ ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: BB4_3: ; %bb8 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_5 ; CHECK-NEXT: BB4_4: ; %bb12 ; CHECK-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -9,9 +9,6 @@ ; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cselect_b32 s1, 1, 0 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -8,9 +8,6 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 1, 0 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -43,39 +43,36 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 56 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_cbranch_scc0 BB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_branch BB2_3 ; GCN-NEXT: BB2_2: -; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s4, -1 ; GCN-NEXT: BB2_3: ; %Flow -; GCN-NEXT: s_xor_b32 s0, s0, -1 -; GCN-NEXT: s_and_b32 s0, s0, 1 -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_xor_b32 s2, s4, -1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 BB2_5 ; GCN-NEXT: ; %bb.4: ; %.zero -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: BB2_5: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/move-uniform-icmp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/move-uniform-icmp.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s + +define amdgpu_cs void @uniform_icmp_select(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out_ptr) { +; GFX10-LABEL: uniform_icmp_select: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, s2, 0xfffffff +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + %result = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_cs void @uniform_icmp_select_not_single_use(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 inreg %src3, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_select_not_single_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, s2, 0xfffffff +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s2 +; GFX10-NEXT: s_cselect_b32 s1, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: global_store_dword v[2:3], v5, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + %result1 = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result1, i32 addrspace(1)* %out_ptr1 + %result2 = select i1 %cmp, i32 %maskedsrc2, i32 %src3 + store i32 %result2, i32 addrspace(1)* %out_ptr2 + ret void +} + +define amdgpu_cs void @uniform_icmp_brcond(i32 inreg %cmp_val, i32 inreg %src, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_brcond: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xfffffff +; GFX10-NEXT: s_cmp_le_i32 s0, -1 +; GFX10-NEXT: s_cbranch_scc1 BB2_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: BB2_2: ; %bb1 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc = and i32 %src, 268435455 + br i1 %cmp, label %bb0, label %bb1 +bb0: + store i32 %maskedsrc, i32 addrspace(1)* %out_ptr1 + br label %bb1 +bb1: + store i32 %maskedsrc, i32 addrspace(1)* %out_ptr2 + ret void +} + +define amdgpu_cs void @uniform_icmp_brcond_not_single_use(i32 inreg %cmp_val, i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_brcond_not_single_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_xor_b32 s0, s3, -1 +; GFX10-NEXT: s_and_b32 s4, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 0xfffffff +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cbranch_scc1 BB3_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s1, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: BB3_2: ; %bb1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_endpgm + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + br i1 %cmp, label %bb0, label %bb1 +bb0: + %result1 = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result1, i32 addrspace(1)* %out_ptr1 + br label %bb1 +bb1: + store i32 %maskedsrc2, i32 addrspace(1)* %out_ptr2 + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -21,18 +21,12 @@ ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -102,9 +96,6 @@ ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5291,34 +5291,29 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 0 +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: s_sub_i32 s4, 64, s12 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5386,32 +5381,27 @@ ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5479,32 +5469,27 @@ ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5545,7 +5530,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s8, s0, s4 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_movk_i32 s12, 0x7f +; GFX10-NEXT: s_movk_i32 s14, 0x7f ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s1, s5 @@ -5570,57 +5555,52 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: s_sub_i32 s2, 64, s14 +; GFX10-NEXT: s_and_b32 s15, 1, s1 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s14 +; GFX10-NEXT: s_sub_i32 s12, s14, 64 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 +; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s14 ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_ashr_i64 s[12:13], s[10:11], s12 +; GFX10-NEXT: s_cmp_lt_u32 s14, 64 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s15 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_cmp_eq_u32 s14, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6586,21 +6566,17 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: s_sub_i32 s21, s20, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_sub_i32 s22, 64, s20 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6610,11 +6586,10 @@ ; GFX6-NEXT: s_ashr_i32 s8, s19, 31 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6670,14 +6645,10 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6686,11 +6657,10 @@ ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6766,17 +6736,13 @@ ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_movk_i32 s20, 0x7f +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: s_sub_i32 s21, s20, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s20 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6784,11 +6750,10 @@ ; GFX8-NEXT: s_ashr_i32 s8, s19, 31 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_cmp_lt_u32 s20, 64 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6856,10 +6821,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6867,11 +6828,10 @@ ; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lt_u32 s20, 64 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6946,17 +6906,13 @@ ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_movk_i32 s20, 0x7f +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: s_sub_i32 s21, s20, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s20 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6964,11 +6920,10 @@ ; GFX9-NEXT: s_ashr_i32 s8, s19, 31 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cmp_lt_u32 s20, 64 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7036,10 +6991,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7047,11 +6998,10 @@ ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lt_u32 s20, 64 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7107,6 +7057,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_addc_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7116,144 +7067,133 @@ ; GFX10-NEXT: s_addc_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_sub_i32 s21, s20, 64 -; GFX10-NEXT: s_sub_i32 s22, 64, s20 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_sub_i32 s23, 64, s20 +; GFX10-NEXT: s_and_b32 s21, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 +; GFX10-NEXT: s_sub_i32 s22, s20, 64 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s23 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s22 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: s_brev_b32 s21, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: s_addc_u32 s3, s3, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s23 +; GFX10-NEXT: s_and_b32 s12, 1, s2 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s22 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_cselect_b64 s[14:15], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: s_addc_u32 s4, s14, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_addc_u32 s1, s15, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5277,34 +5277,29 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: s_sub_i32 s4, 64, s12 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5372,32 +5367,27 @@ ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5465,32 +5455,27 @@ ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5531,7 +5516,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s8, s0, s4 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_movk_i32 s12, 0x7f +; GFX10-NEXT: s_movk_i32 s14, 0x7f ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 @@ -5556,57 +5541,52 @@ ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: s_sub_i32 s2, 64, s14 +; GFX10-NEXT: s_and_b32 s15, 1, s1 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s14 +; GFX10-NEXT: s_sub_i32 s12, s14, 64 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 +; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s14 ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_ashr_i64 s[12:13], s[10:11], s12 +; GFX10-NEXT: s_cmp_lt_u32 s14, 64 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s15 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_cmp_eq_u32 s14, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6572,21 +6552,17 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: s_sub_i32 s21, s20, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_sub_i32 s22, 64, s20 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6596,11 +6572,10 @@ ; GFX6-NEXT: s_ashr_i32 s8, s19, 31 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6656,14 +6631,10 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6672,11 +6643,10 @@ ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6752,17 +6722,13 @@ ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_movk_i32 s20, 0x7f +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: s_sub_i32 s21, s20, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s20 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6770,11 +6736,10 @@ ; GFX8-NEXT: s_ashr_i32 s8, s19, 31 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_cmp_lt_u32 s20, 64 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6842,10 +6807,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: s_cmp_lt_u32 s20, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6853,11 +6814,10 @@ ; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lt_u32 s20, 64 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6932,17 +6892,13 @@ ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_movk_i32 s20, 0x7f +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: s_sub_i32 s21, s20, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s20 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6950,11 +6906,10 @@ ; GFX9-NEXT: s_ashr_i32 s8, s19, 31 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cmp_lt_u32 s20, 64 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7022,10 +6977,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: s_cmp_lt_u32 s20, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7033,11 +6984,10 @@ ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lt_u32 s20, 64 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7093,6 +7043,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7102,144 +7053,133 @@ ; GFX10-NEXT: s_subb_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_sub_i32 s21, s20, 64 -; GFX10-NEXT: s_sub_i32 s22, 64, s20 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_sub_i32 s23, 64, s20 +; GFX10-NEXT: s_and_b32 s21, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 +; GFX10-NEXT: s_sub_i32 s22, s20, 64 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s23 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s22 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: s_brev_b32 s21, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: s_addc_u32 s3, s3, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_subb_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s23 +; GFX10-NEXT: s_and_b32 s12, 1, s2 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 +; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s22 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, s[14:15], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_cselect_b64 s[14:15], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: s_addc_u32 s4, s14, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_addc_u32 s1, s15, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5