Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,6 +37,14 @@
          [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def uniform_icmp_select_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::UniformICmpSelectMatchInfo">;
+
+def uniform_icmp_select : GICombineRule<
+  (defs root:$select, uniform_icmp_select_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SELECT):$select,
+         [{ return RegBankHelper.matchUniformICmpSelect(*${select}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applyUniformICmpSelect(*${select}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -56,6 +64,8 @@
 }
 
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
-  "AMDGPUGenRegBankCombinerHelper", []> {
+  "AMDGPUGenRegBankCombinerHelper", [uniform_icmp_select]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+  let StateClass = "AMDGPURegBankCombinerHelperState";
+  let AdditionalArguments = [];
 }
Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -13,6 +13,7 @@
 
 #include "AMDGPU.h"
 #include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -27,6 +28,79 @@
 
 using namespace llvm;
 using namespace MIPatternMatch;
 
+class AMDGPURegBankCombinerHelper {
+protected:
+  MachineIRBuilder &B;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  const RegisterBankInfo &RBI;
+  const TargetRegisterInfo &TRI;
+  CombinerHelper &Helper;
+
+public:
+  AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+        RBI(*MF.getSubtarget().getRegBankInfo()),
+        TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){};
+
+  bool isSgprRegBank(Register Reg);
+
+  struct UniformICmpSelectMatchInfo {
+    Register ICmpReg;
+  };
+
+  // select zext(trunc(icmp)) ? a : b -> select icmp ? a : b
+  bool matchUniformICmpSelect(MachineInstr &MI,
+                              UniformICmpSelectMatchInfo &MatchInfo);
+  void applyUniformICmpSelect(MachineInstr &MI,
+                              UniformICmpSelectMatchInfo &MatchInfo);
+};
+
+bool AMDGPURegBankCombinerHelper::isSgprRegBank(Register Reg) {
+  return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
+}
+
+bool AMDGPURegBankCombinerHelper::matchUniformICmpSelect(
+    MachineInstr &MI, UniformICmpSelectMatchInfo &MatchInfo) {
+  Register Cmp = MI.getOperand(1).getReg();
+  if (!isSgprRegBank(Cmp) || !MRI.hasOneNonDBGUse(Cmp))
+    return false;
+
+  Register ZextSrc;
+  if (!mi_match(Cmp, MRI, m_GZExt(m_Reg(ZextSrc))) ||
+      !MRI.hasOneNonDBGUse(ZextSrc))
+    return false;
+
+  Register TruncSrc;
+  if (!mi_match(ZextSrc, MRI, m_GTrunc(m_Reg(TruncSrc))) ||
+      !MRI.hasOneNonDBGUse(TruncSrc) ||
+      MRI.getVRegDef(TruncSrc)->getOpcode() != TargetOpcode::G_ICMP)
+    return false;
+
+  MatchInfo.ICmpReg = TruncSrc;
+  return true;
+}
+
+void AMDGPURegBankCombinerHelper::applyUniformICmpSelect(
+    MachineInstr &MI, UniformICmpSelectMatchInfo &MatchInfo) {
+  MachineInstr &ICMP = *MRI.getVRegDef(MatchInfo.ICmpReg);
+  B.setInstrAndDebugLoc(MI);
+  B.buildICmp((CmpInst::Predicate)ICMP.getOperand(1).getPredicate(),
+              ICMP.getOperand(0), ICMP.getOperand(2), ICMP.getOperand(3));
+  MI.getOperand(1).setReg(MatchInfo.ICmpReg);
+  ICMP.eraseFromParent();
+}
+
+class AMDGPURegBankCombinerHelperState {
+protected:
+  CombinerHelper &Helper;
+  AMDGPURegBankCombinerHelper &RegBankHelper;
+
+public:
+  AMDGPURegBankCombinerHelperState(CombinerHelper &Helper,
+                                   AMDGPURegBankCombinerHelper &RegBankHelper)
+      : Helper(Helper), RegBankHelper(RegBankHelper) {}
+};
 #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenRegBankGICombiner.inc"
@@ -62,9 +136,11 @@
                                         MachineInstr &MI,
                                         MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+  AMDGPURegBankCombinerHelper RegBankHelper(B, Helper);
+  AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper,
+                                           RegBankHelper);
 
-  if (Generated.tryCombineAll(Observer, MI, B, Helper))
+  if (Generated.tryCombineAll(Observer, MI, B))
     return true;
 
   return false;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,6 +228,7 @@
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
+  initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
@@ -803,6 +804,7 @@
   bool addLegalizeMachineIR() override;
   void addPreRegBankSelect() override;
   bool addRegBankSelect() override;
+  void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
@@ -1109,6 +1111,11 @@
   return false;
 }
 
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+  addPass(createAMDGPURegBankCombiner(IsOptNone));
+}
+
 bool GCNPassConfig::addGlobalInstructionSelect() {
   addPass(new InstructionSelect());
   // TODO: Fix instruction selection to do the right thing for image
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-uniform-icmp-select.mir
=================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-uniform-icmp-select.mir @@ -0,0 +1,150 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: uniform_icmp_select +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: uniform_icmp_select + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[MV:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[MV2:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK: [[MV3:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV1]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[LOAD2:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV2]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD2]], [[C1]] + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[LOAD]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[LOAD1]], [[AND]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY8]](s32), [[MV3]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %4:sgpr(s32) = COPY $sgpr0 + %5:sgpr(s32) = COPY $sgpr1 + %6:sgpr(s32) = COPY $sgpr2 + %7:sgpr(s32) = COPY $sgpr3 + %8:sgpr(s32) = COPY $sgpr4 + %9:sgpr(s32) = COPY $sgpr5 + %10:vgpr(s32) = COPY $vgpr0 + %11:vgpr(s32) = COPY $vgpr1 + %0:sgpr(p4) = G_MERGE_VALUES %4(s32), %5(s32) + %1:sgpr(p4) = G_MERGE_VALUES %6(s32), %7(s32) + %2:sgpr(p4) = G_MERGE_VALUES %8(s32), %9(s32) + %3:vgpr(p1) = G_MERGE_VALUES %10(s32), %11(s32) + %13:sgpr(s32) = G_LOAD %0(p4) :: (load 4, align 32, addrspace 4) + %14:sgpr(s32) = G_LOAD %1(p4) :: (load 4, align 32, addrspace 4) + %15:sgpr(s32) = G_LOAD %2(p4) :: (load 4, align 32, addrspace 4) + %16:sgpr(s32) = G_CONSTANT i32 -1 + %21:sgpr(s32) = G_ICMP intpred(sgt), %13(s32), %16 + %17:sgpr(s1) = G_TRUNC %21(s32) + %18:sgpr(s32) = G_CONSTANT i32 268435455 + %19:sgpr(s32) = G_AND %15, %18 + %22:sgpr(s32) = G_ZEXT %17(s1) + %20:sgpr(s32) = G_SELECT %22(s32), %14, %19 + %23:vgpr(s32) = COPY %20(s32) + G_STORE %23(s32), %3(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... 
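+# Note: in the test above the combine fires because the G_ICMP result and the
+# G_TRUNC/G_ZEXT chain each have a single non-debug use, so the widened
+# condition is folded away and the G_SELECT condition uses the uniform (SGPR)
+# G_ICMP result directly, as the CHECK lines show.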
+ +--- +name: uniform_icmp_select_not_single_use +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: uniform_icmp_select_not_single_use + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[MV:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[MV2:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK: [[MV3:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; CHECK: [[MV4:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; CHECK: [[MV5:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV1]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[LOAD2:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV2]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[LOAD3:%[0-9]+]]:sgpr(s32) = G_LOAD [[MV3]](p4) :: (load 4, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(sgt), [[LOAD]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 268435455 + ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD2]], [[C1]] + ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[LOAD1]], [[AND]] + ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK: G_STORE [[COPY12]](s32), [[MV4]](p1) :: (store 4, addrspace 1) + ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT1]](s32), [[AND]], [[LOAD3]] + ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[SELECT1]](s32) + ; CHECK: G_STORE [[COPY13]](s32), [[MV5]](p1) :: (store 4, addrspace 1) + ; CHECK: S_ENDPGM 0 + %6:sgpr(s32) = COPY $sgpr0 + %7:sgpr(s32) = COPY $sgpr1 + %8:sgpr(s32) = COPY $sgpr2 + %9:sgpr(s32) = COPY $sgpr3 + %10:sgpr(s32) = COPY $sgpr4 + %11:sgpr(s32) = COPY $sgpr5 + %12:sgpr(s32) = COPY $sgpr6 + %13:sgpr(s32) = COPY $sgpr7 + %14:vgpr(s32) = COPY $vgpr0 + %15:vgpr(s32) = COPY $vgpr1 + %16:vgpr(s32) = COPY $vgpr2 + %17:vgpr(s32) = COPY $vgpr3 + %0:sgpr(p4) = G_MERGE_VALUES %6(s32), %7(s32) + %1:sgpr(p4) = G_MERGE_VALUES %8(s32), %9(s32) + %2:sgpr(p4) = G_MERGE_VALUES %10(s32), %11(s32) + %3:sgpr(p4) = G_MERGE_VALUES %12(s32), %13(s32) + %4:vgpr(p1) = G_MERGE_VALUES %14(s32), %15(s32) + %5:vgpr(p1) = G_MERGE_VALUES %16(s32), %17(s32) + %19:sgpr(s32) = G_LOAD %0(p4) :: (load 4, align 32, 
addrspace 4) + %20:sgpr(s32) = G_LOAD %1(p4) :: (load 4, align 32, addrspace 4) + %21:sgpr(s32) = G_LOAD %2(p4) :: (load 4, align 32, addrspace 4) + %22:sgpr(s32) = G_LOAD %3(p4) :: (load 4, align 32, addrspace 4) + %23:sgpr(s32) = G_CONSTANT i32 -1 + %29:sgpr(s32) = G_ICMP intpred(sgt), %19(s32), %23 + %24:sgpr(s1) = G_TRUNC %29(s32) + %25:sgpr(s32) = G_CONSTANT i32 268435455 + %26:sgpr(s32) = G_AND %21, %25 + %30:sgpr(s32) = G_ZEXT %24(s1) + %27:sgpr(s32) = G_SELECT %30(s32), %20, %26 + %31:vgpr(s32) = COPY %27(s32) + G_STORE %31(s32), %4(p1) :: (store 4, addrspace 1) + %32:sgpr(s32) = G_ZEXT %24(s1) + %28:sgpr(s32) = G_SELECT %32(s32), %26, %22 + %33:vgpr(s32) = COPY %28(s32) + G_STORE %33(s32), %5(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5293,32 +5293,29 @@ ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_and_b32 s13, s13, 1 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5387,31 +5384,28 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; 
GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_and_b32 s13, s13, 1 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5480,31 +5474,28 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_and_b32 s13, s13, 1 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5570,57 +5561,54 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: s_and_b32 s13, 1, s1 ; GFX10-NEXT: s_sub_i32 s2, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s13 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 +; GFX10-NEXT: s_and_b32 s15, s15, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 ; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 ; 
GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6586,21 +6574,19 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6613,8 +6599,7 @@ ; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6670,14 +6655,12 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: 
v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6689,8 +6672,7 @@ ; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6775,8 +6757,6 @@ ; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6787,8 +6767,7 @@ ; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6858,8 +6837,6 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6870,8 +6847,7 @@ ; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6955,8 +6931,6 @@ ; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6967,8 +6941,7 @@ ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7038,8 +7011,6 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7050,8 +7021,7 @@ ; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: 
s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7107,6 +7077,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_addc_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7116,144 +7087,137 @@ ; GFX10-NEXT: s_addc_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 +; GFX10-NEXT: s_and_b32 s23, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 ; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: 
s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s12, 1, s2 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 ; GFX10-NEXT: s_and_b32 s13, s10, 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; 
GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5279,32 +5279,29 @@ ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] -; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_movk_i32 s12, 0x7f ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 -; GFX6-NEXT: s_sub_i32 s6, s2, 64 -; GFX6-NEXT: s_sub_i32 s4, 64, s2 -; GFX6-NEXT: s_cmp_lt_u32 s2, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_sub_i32 s6, s12, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_and_b32 s13, s13, 1 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_add_u32 s2, s2, 0 ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 @@ -5373,31 +5370,28 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 
0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_movk_i32 s12, 0x7f ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_movk_i32 s2, 0x7f -; GFX8-NEXT: s_sub_i32 s6, s2, 64 -; GFX8-NEXT: s_sub_i32 s4, 64, s2 -; GFX8-NEXT: s_cmp_lt_u32 s2, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_sub_i32 s6, s12, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_and_b32 s13, s13, 1 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s2, s2, 0 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -5466,31 +5460,28 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_movk_i32 s12, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_movk_i32 s2, 0x7f -; GFX9-NEXT: s_sub_i32 s6, s2, 64 -; GFX9-NEXT: s_sub_i32 s4, 64, s2 -; GFX9-NEXT: s_cmp_lt_u32 s2, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_sub_i32 s6, s12, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[10:11], s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_and_b32 s13, s13, 1 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, 0 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 @@ -5556,57 +5547,54 @@ ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[4:5], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, 64 -; GFX10-NEXT: s_and_b32 s14, 1, s1 +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: 
s_and_b32 s13, 1, s1 ; GFX10-NEXT: s_sub_i32 s2, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s13 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 +; GFX10-NEXT: s_and_b32 s15, s15, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 ; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6572,21 +6560,19 @@ ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_movk_i32 s20, 0x7f ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 ; GFX6-NEXT: s_sub_i32 s21, s20, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s20 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; 
GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6599,8 +6585,7 @@ ; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6656,14 +6641,12 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s20, 0 +; GFX6-NEXT: s_cmp_lt_u32 s20, 64 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -6675,8 +6658,7 @@ ; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6761,8 +6743,6 @@ ; GFX8-NEXT: s_sub_i32 s22, 64, s20 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6773,8 +6753,7 @@ ; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6844,8 +6823,6 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: s_cmp_lt_u32 s20, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s20, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6856,8 +6833,7 @@ ; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6941,8 +6917,6 @@ ; GFX9-NEXT: s_sub_i32 s22, 64, s20 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6953,8 +6927,7 @@ ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: 
s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -7024,8 +6997,6 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: s_cmp_lt_u32 s20, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s20 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s22 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -7036,8 +7007,7 @@ ; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -7093,6 +7063,7 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s30, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 @@ -7102,144 +7073,137 @@ ; GFX10-NEXT: s_subb_u32 s31, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10-NEXT: s_movk_i32 s20, 0x7f -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_movk_i32 s20, 0x7f +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 +; GFX10-NEXT: s_and_b32 s23, s10, 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_ashr_i32 s2, s31, 31 ; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: 
s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_brev_b32 s23, 1 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s28 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s29 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 ; GFX10-NEXT: s_addc_u32 s3, s3, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_subb_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0 ; GFX10-NEXT: s_subb_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s12, 1, s2 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s20, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 ; GFX10-NEXT: s_and_b32 s13, s10, 1 ; 
GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 ; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_cmp_eq_u32 s20, 0 +; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uniform-icmp-select.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uniform-icmp-select.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s + +define amdgpu_cs void @uniform_icmp_select(i32 addrspace(4)* inreg %cmp_val_ptr, i32 addrspace(4)* inreg %src1_ptr, i32 addrspace(4)* inreg %src2_ptr, i32 addrspace(1)* %out_ptr) { +; GFX10-LABEL: uniform_icmp_select: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s4, 0xfffffff +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm + %cmp_val = load i32, i32 addrspace(4)* %cmp_val_ptr, align 32 + %src1 = load i32, i32 addrspace(4)* %src1_ptr, align 32 + %src2 = load i32, i32 addrspace(4)* %src2_ptr, align 32 + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and 
i32 %src2, 268435455 + %result = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_cs void @uniform_icmp_select_not_single_use(i32 addrspace(4)* inreg %cmp_val_ptr, i32 addrspace(4)* inreg %src1_ptr, i32 addrspace(4)* inreg %src2_ptr, i32 addrspace(4)* inreg %src3_ptr, i32 addrspace(1)* %out_ptr1, i32 addrspace(1)* %out_ptr2) { +; GFX10-LABEL: uniform_icmp_select_not_single_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s1, s1, 0xfffffff +; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s2, s2, s1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: global_store_dword v[2:3], v5, off +; GFX10-NEXT: s_endpgm + %cmp_val = load i32, i32 addrspace(4)* %cmp_val_ptr, align 32 + %src1 = load i32, i32 addrspace(4)* %src1_ptr, align 32 + %src2 = load i32, i32 addrspace(4)* %src2_ptr, align 32 + %src3 = load i32, i32 addrspace(4)* %src3_ptr, align 32 + %cmp = icmp sgt i32 %cmp_val, -1 + %maskedsrc2 = and i32 %src2, 268435455 + %result1 = select i1 %cmp, i32 %src1, i32 %maskedsrc2 + store i32 %result1, i32 addrspace(1)* %out_ptr1 + %result2 = select i1 %cmp, i32 %maskedsrc2, i32 %src3 + store i32 %result2, i32 addrspace(1)* %out_ptr2 + ret void +}
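
For reference, the rewrite the new rule performs on uniform (SGPR) values is
select (zext (trunc (icmp ...))), a, b --> select (icmp ...), a, b. The snippet
below is a minimal standalone C++ sketch, not part of the patch and with
invented helper names; it only spells out the scalar equivalence the combine
relies on: the trunc/zext round trip preserves the single condition bit the
compare produces, so selecting on the widened condition gives the same result
as selecting on the compare itself.

// Illustration only (hypothetical helpers, not part of the patch).
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Mirrors the pre-combine chain: G_ICMP -> G_TRUNC to s1 -> G_ZEXT to s32 -> G_SELECT.
static uint32_t selectViaZextTrunc(int32_t CmpVal, uint32_t A, uint32_t B) {
  uint32_t Icmp = CmpVal > -1 ? 1u : 0u; // G_ICMP result held in a 32-bit SGPR
  bool Trunc = (Icmp & 1u) != 0;         // G_TRUNC s32 -> s1 keeps the low bit
  uint32_t Zext = Trunc ? 1u : 0u;       // G_ZEXT s1 -> s32
  return Zext != 0 ? A : B;              // G_SELECT on the widened condition
}

// Mirrors the combined form: G_SELECT fed directly by the G_ICMP result.
static uint32_t selectOnICmp(int32_t CmpVal, uint32_t A, uint32_t B) {
  return CmpVal > -1 ? A : B;
}

int main() {
  for (int32_t X : {-2, -1, 0, 1, 0x0FFFFFFF})
    assert(selectViaZextTrunc(X, 10u, 20u) == selectOnICmp(X, 10u, 20u));
  return 0;
}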