Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -64,6 +64,8 @@ } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", []> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; + let StateClass = "AMDGPURegBankCombinerHelperState"; + let AdditionalArguments = []; } Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -13,6 +13,7 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -27,6 +28,32 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPURegBankCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + const RegisterBankInfo &RBI; + const TargetRegisterInfo &TRI; + CombinerHelper &Helper; + +public: + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + : B(B), MF(B.getMF()), MRI(*B.getMRI()), + RBI(*MF.getSubtarget().getRegBankInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; +}; + +class AMDGPURegBankCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPURegBankCombinerHelper &RegBankHelper; + +public: + AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, + AMDGPURegBankCombinerHelper &RegBankHelper) + : Helper(Helper), RegBankHelper(RegBankHelper) {} +}; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" @@ -62,9 +89,11 @@ MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, + RegBankHelper); - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -228,6 +228,7 @@ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); + initializeAMDGPURegBankCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); @@ -807,6 +808,7 @@ bool addLegalizeMachineIR() override; void addPreRegBankSelect() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1113,6 +1115,11 @@ return false; } +void GCNPassConfig::addPreGlobalInstructionSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPURegBankCombiner(IsOptNone)); +} + bool GCNPassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); // TODO: Fix instruction selection to do the right thing for image Index: llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -139,9 +139,6 @@ ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc1 BB4_4 ; CHECK-NEXT: ; %bb.1: ; %bb2 ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -161,9 +158,6 @@ ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: BB4_3: ; %bb8 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_5 ; CHECK-NEXT: BB4_4: ; %bb12 ; CHECK-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -9,9 +9,6 @@ ; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cselect_b32 s1, 1, 0 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -8,9 +8,6 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 1, 0 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -53,9 +53,6 @@ ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cselect_b32 s0, 1, 0 -; CI-NEXT: s_and_b32 s0, s0, 1 -; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 BB1_2 ; CI-NEXT: ; %bb.1: ; %bb0 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -71,9 +68,6 @@ ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_cmp_lg_u32 s1, s0 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 1 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -43,19 +43,18 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_buffer_load_dword s1, s[8:11], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 56 ; GCN-NEXT: s_cselect_b32 s0, 1, 0 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_and_b32 s0, s0, 1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 BB2_2 ; GCN-NEXT: ; %bb.1: ; %.one Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -21,18 +21,12 @@ ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -102,9 +96,6 @@ ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s6, 1, 0 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4819,11 +4819,9 @@ ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -4912,11 +4910,9 @@ ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -5005,11 +5001,9 @@ ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -5080,55 +5074,53 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_sub_i32 s13, s12, 64 ; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: s_sub_i32 s15, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[10:11], s15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_ashr_i32 s6, s11, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 +; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s13 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6118,11 +6110,9 @@ ; GFX6-NEXT: s_ashr_i32 s8, s19, 31 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s24, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6194,11 +6184,9 @@ ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6292,11 +6280,9 @@ ; GFX8-NEXT: s_ashr_i32 s8, s19, 31 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6375,11 +6361,9 @@ ; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6472,11 +6456,9 @@ ; GFX9-NEXT: s_ashr_i32 s8, s19, 31 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -6555,11 +6537,9 @@ ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -6605,15 +6585,15 @@ ; ; GFX10-LABEL: s_saddsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s28, s0, s8 +; GFX10-NEXT: s_add_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 ; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 -; GFX10-NEXT: s_addc_u32 s29, s1, s9 +; GFX10-NEXT: s_addc_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_addc_u32 s30, s2, s10 @@ -6633,136 +6613,132 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s20, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 +; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_brev_b32 s23, 1 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX10-NEXT: s_add_u32 s2, s4, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 -; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s5, s13 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_addc_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[2:3], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[6:7] +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: s_and_b32 s2, 1, s2 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[12:13], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s16, 1, s1 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[14:15], 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s20, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s20 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: s_ashr_i32 s10, s9, 31 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[8:9], s20 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s21 +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_add_u32 s2, s2, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_add_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: s_addc_u32 s1, s1, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4805,11 +4805,9 @@ ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_and_b32 s6, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -4898,11 +4896,9 @@ ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_ashr_i32 s4, s11, 31 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_and_b32 s6, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -4991,11 +4987,9 @@ ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6 -; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_and_b32 s6, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -5066,55 +5060,53 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_sub_i32 s13, s12, 64 ; GFX10-NEXT: s_and_b32 s14, 1, s1 -; GFX10-NEXT: s_sub_i32 s2, 64, s12 +; GFX10-NEXT: s_sub_i32 s15, 64, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s12 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[10:11], s15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2 -; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12 -; GFX10-NEXT: s_and_b32 s12, s15, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_and_b32 s6, s16, 1 +; GFX10-NEXT: s_ashr_i32 s6, s11, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[10:11], s12 +; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s13 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo -; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s1, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 @@ -6104,11 +6096,9 @@ ; GFX6-NEXT: s_ashr_i32 s8, s19, 31 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX6-NEXT: s_and_b32 s23, s23, 1 ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s24, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s24, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s23, 0 ; GFX6-NEXT: s_mov_b32 s9, s8 @@ -6180,11 +6170,9 @@ ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX6-NEXT: s_and_b32 s12, s12, 1 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX6-NEXT: s_and_b32 s10, s13, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 @@ -6278,11 +6266,9 @@ ; GFX8-NEXT: s_ashr_i32 s8, s19, 31 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX8-NEXT: s_and_b32 s23, s23, 1 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s24, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_mov_b32 s9, s8 @@ -6361,11 +6347,9 @@ ; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX8-NEXT: s_and_b32 s12, s12, 1 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX8-NEXT: s_and_b32 s10, s13, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 @@ -6458,11 +6442,9 @@ ; GFX9-NEXT: s_ashr_i32 s8, s19, 31 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21 -; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s24, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s23, 0 ; GFX9-NEXT: s_mov_b32 s9, s8 @@ -6541,11 +6523,9 @@ ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21 -; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] -; GFX9-NEXT: s_and_b32 s10, s13, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 @@ -6591,15 +6571,15 @@ ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s28, s0, s8 +; GFX10-NEXT: s_sub_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 ; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 -; GFX10-NEXT: s_subb_u32 s29, s1, s9 +; GFX10-NEXT: s_subb_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 ; GFX10-NEXT: s_subb_u32 s30, s2, s10 @@ -6619,136 +6599,132 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_sub_i32 s21, s20, 64 ; GFX10-NEXT: s_sub_i32 s22, 64, s20 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s20, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22 -; GFX10-NEXT: s_and_b32 s24, s10, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_ashr_i32 s2, s31, 31 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21 +; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s23, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s29 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] -; GFX10-NEXT: s_add_u32 s0, s0, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_add_u32 s2, s2, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s28 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_brev_b32 s23, 1 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_mov_b32_e32 v3, s31 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_addc_u32 s2, s2, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX10-NEXT: s_sub_u32 s2, s4, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo +; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s30 -; GFX10-NEXT: s_addc_u32 s3, s3, s23 -; GFX10-NEXT: s_sub_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_subb_u32 s1, s5, s13 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_subb_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[2:3], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_subb_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[6:7] +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: s_and_b32 s2, 1, s2 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s16, 1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_and_b32 s16, 1, s1 ; GFX10-NEXT: s_cmp_lt_u32 s20, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[14:15], 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s20, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20 -; GFX10-NEXT: s_and_b32 s13, s10, 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_ashr_i32 s4, s9, 31 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20 -; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_and_b32 s10, s12, 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s20 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[8:9], s22 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: s_ashr_i32 s10, s9, 31 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_ashr_i64 s[0:1], s[8:9], s20 +; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s21 +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_add_u32 s2, s2, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_add_u32 s4, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s4, s4, 0 +; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo -; GFX10-NEXT: s_addc_u32 s1, s5, s23 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: s_addc_u32 s1, s1, s23 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6