diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1976,104 +1976,43 @@ return Res; } - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - - // RCP = URECIP(Den) = 2^32 / Den + e - // e is rounding error. - SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - - // RCP_LO = mul(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); - - // RCP_HI = mulhu (RCP, Den) */ - SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); - - // NEG_RCP_LO = -RCP_LO - SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - RCP_LO); - - const SDValue Zero = DAG.getConstant(0, DL, VT); - const EVT CCVT = getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), VT); - - // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - SDValue CmpRcpHiZero = DAG.getSetCC(DL, CCVT, RCP_HI, Zero, ISD::SETEQ); - SDValue ABS_RCP_LO = DAG.getNode(ISD::SELECT, - DL, VT, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); - - // Calculate the rounding error from the URECIP instruction - // E = mulhu(ABS_RCP_LO, RCP) - SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); - - // RCP_A_E = RCP + E - SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); - - // RCP_S_E = RCP - E - SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); - - // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - SDValue Tmp0 = DAG.getNode(ISD::SELECT, DL, VT, - CmpRcpHiZero, RCP_A_E, RCP_S_E); - - // Quotient = mulhu(Tmp0, Num) - SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); - - // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); - - // Remainder = Num - Num_S_Remainder - SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); - - // Remainder_GE_Den = (Remainder >= Den) - SDValue Remainder_GE_Den = DAG.getSetCC(DL, CCVT, Remainder, Den, ISD::SETUGE); - - // Remainder_GE_Zero = (Num >= Num_S_Remainder) - SDValue Remainder_GE_Zero = DAG.getSetCC(DL, CCVT, Num, Num_S_Remainder, - ISD::SETUGE); - - // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero - SDValue Tmp1 = DAG.getNode(ISD::AND, DL, CCVT, Remainder_GE_Den, - Remainder_GE_Zero); - - // Calculate Division result: - - // Quotient_A_One = Quotient + 1 - SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Quotient_S_One = Quotient - 1 - SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Div = (Tmp1 ? Quotient_A_One : Quotient) - SDValue Div = DAG.getNode(ISD::SELECT, DL, VT, Tmp1, - Quotient_A_One, Quotient); - - // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) - Div = DAG.getNode(ISD::SELECT, DL, VT, Remainder_GE_Zero, - Div, Quotient_S_One); - - // Calculate Rem result: - - // Remainder_S_Den = Remainder - Den - SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); - - // Remainder_A_Den = Remainder + Den - SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); - - // Rem = (Tmp1 ? Remainder_S_Den : Remainder) - SDValue Rem = DAG.getNode(ISD::SELECT, DL, VT, Tmp1, - Remainder_S_Den, Remainder); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); - // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) - Rem = DAG.getNode(ISD::SELECT, DL, VT, - Remainder_GE_Zero, Rem, Remainder_A_Den); - SDValue Ops[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Ops, DL); + // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the + // algorithm used here. + + // Initial estimate of inv(y). + SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y); + + // One round of UNR. + SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y); + SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z); + Z = DAG.getNode(ISD::ADD, DL, VT, Z, + DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ)); + + // Quotient/remainder estimate. + SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z); + SDValue R = + DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y)); + + // First quotient/remainder refinement. + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); + Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); + R = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::SUB, DL, VT, R, Y), R); + + // Second quotient/remainder refinement. + Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); + Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); + R = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::SUB, DL, VT, R, Y), R); + + return DAG.getMergeValues({Q, R}, DL); } SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -528,7 +528,7 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; -int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP_4294966784 = 0x4f7ffffe; // 4294966784 = 4294967296 - 512 = 2^32 - 2^9 int FP16_ONE = 0x3C00; int FP16_NEG_ONE = 0xBC00; int FP32_ONE = 0x3f800000; diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td --- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -57,11 +57,12 @@ defm DIV_cm : DIV_Common; // RECIP_UINT emulation for Cayman -// The multiplication scales from [0,1] to the unsigned integer range +// The multiplication scales from [0,1) to the unsigned integer range, +// rounding down a bit to avoid unwanted overflow. def : R600Pat < (AMDGPUurecip i32:$src0), (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), - (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) + (MOV_IMM_I32 CONST.FP_4294966784))) >; def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1552,11 +1552,12 @@ def : Ext32Pat ; def : Ext32Pat ; -// The multiplication scales from [0,1] to the unsigned integer range +// The multiplication scales from [0,1) to the unsigned integer range, +// rounding down a bit to avoid unwanted overflow. def : GCNPat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), + (V_MUL_F32_e32 (i32 CONST.FP_4294966784), (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -4356,34 +4356,30 @@ ; GCN-NEXT: s_add_i32 s3, s3, s8 ; GCN-NEXT: s_xor_b32 s9, s3, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s3 +; GCN-NEXT: s_sub_i32 s3, 0, s9 +; GCN-NEXT: s_ashr_i32 s0, s2, 31 +; GCN-NEXT: s_add_i32 s1, s2, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s3, s3, s8 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_xor_b32 s2, s0, s8 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl i32 4096, %y @@ -4690,45 +4686,38 @@ ; ; GCN-LABEL: srem_i32_pow2_shl_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, 0x1000, s5 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s10, s2, s3 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GCN-NEXT: s_ashr_i32 s8, s4, 31 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: s_xor_b32 s9, s4, s8 +; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GCN-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-NEXT: s_add_i32 s3, s3, s4 +; GCN-NEXT: s_xor_b32 s4, s3, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_sub_i32 s3, 0, s4 +; GCN-NEXT: s_ashr_i32 s5, s2, 31 +; GCN-NEXT: s_add_i32 s2, s2, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 +; GCN-NEXT: s_xor_b32 s6, s2, s5 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s10 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s10 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s9, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s9, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s10, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v1 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s5, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -138,36 +138,32 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc ; GFX9-NEXT: BB0_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB0_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v5, -1, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: BB0_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -293,36 +289,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GFX9-NEXT: BB1_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB1_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v5, -1, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc ; GFX9-NEXT: BB1_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -462,36 +454,30 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc ; GFX9-NEXT: BB2_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v3, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_add_u32_e32 v4, v3, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: BB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -616,36 +602,30 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v10, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GFX9-NEXT: BB3_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v3, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_add_u32_e32 v4, v3, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: BB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -924,41 +904,35 @@ ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9] ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v8, vcc ; GFX9-NEXT: BB8_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[10:11] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[10:11] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB8_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v1 -; GFX9-NEXT: v_sub_u32_e32 v5, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v5, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_add_u32_e32 v8, v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: BB8_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 @@ -1097,41 +1071,35 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v1, vcc ; GFX9-NEXT: BB9_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[8:9] -; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_u32_e32 v4, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v6, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_add_u32_e32 v8, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GFX9-NEXT: BB9_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -153,7 +153,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 30, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 26, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -165,29 +165,25 @@ ; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y, ; EG-NEXT: ADD_INT * T1.W, T0.Y, PV.W, ; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W, +; EG-NEXT: SUB_INT T2.W, 0.0, PV.W, ; EG-NEXT: RECIP_UINT * T0.Y, PV.W, -; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W, -; EG-NEXT: SUB_INT T2.W, 0.0, PS, -; EG-NEXT: MULHI * T1.X, T0.Y, T1.W, -; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Z, -; EG-NEXT: SETGT_INT * T3.W, 0.0, T0.X, -; EG-NEXT: MULHI * T0.Z, PV.W, T0.Y, -; EG-NEXT: ADD_INT T1.Z, T0.X, T3.W, -; EG-NEXT: ADD_INT T2.W, T0.Y, PS, -; EG-NEXT: SUB_INT * T4.W, T0.Y, PS, -; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS, -; EG-NEXT: XOR_INT * T4.W, PV.Z, T3.W, -; EG-NEXT: MULHI * T0.X, PV.W, PS, +; EG-NEXT: SETGT_INT T3.W, 0.0, T0.X, +; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, +; EG-NEXT: ADD_INT T2.W, T0.X, PV.W, +; EG-NEXT: MULHI * T0.X, T0.Y, PS, +; EG-NEXT: ADD_INT T4.W, T0.Y, PS, +; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W, +; EG-NEXT: MULHI * T0.X, PS, PV.W, ; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W, -; EG-NEXT: SUB_INT * T2.W, T4.W, PS, -; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W, -; EG-NEXT: SETGE_UINT * T2.W, T4.W, T0.Y, -; EG-NEXT: AND_INT T1.W, PV.W, PS, -; EG-NEXT: ADD_INT * T4.W, T0.X, 1, -; EG-NEXT: CNDE_INT T1.W, PV.W, T0.X, PS, -; EG-NEXT: ADD_INT * T4.W, T0.X, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.W, T2.W, PS, PV.W, +; EG-NEXT: SUB_INT * T2.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T4.W, PV.W, T1.W, +; EG-NEXT: SUB_INT * T5.W, PV.W, T1.W, +; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, +; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T5.W, PS, 1, +; EG-NEXT: SETGE_UINT * T1.W, PV.W, T1.W, +; EG-NEXT: CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221 ; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W, ; EG-NEXT: XOR_INT * T1.W, PV.W, PS, ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, @@ -622,7 +618,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 59, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -633,61 +629,53 @@ ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Y, -; EG-NEXT: ADD_INT * T1.W, T1.Y, PV.W, -; EG-NEXT: XOR_INT T1.W, PV.W, T0.W, +; EG-NEXT: ADD_INT T1.W, T1.Y, PV.W, ; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.X, -; EG-NEXT: ADD_INT T3.W, T1.X, PS, -; EG-NEXT: RECIP_UINT * T0.Z, PV.W, -; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, BS:VEC_021/SCL_122 -; EG-NEXT: MULLO_INT * T1.X, PS, T1.W, +; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W, +; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W, +; EG-NEXT: ADD_INT T3.W, T1.X, T2.W, +; EG-NEXT: RECIP_UINT * T1.X, PV.W, +; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, +; EG-NEXT: MULLO_INT * T0.Z, PV.Z, PS, +; EG-NEXT: SUB_INT T4.W, 0.0, PV.W, ; EG-NEXT: RECIP_UINT * T1.Y, PV.W, -; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W, -; EG-NEXT: SUB_INT T4.W, 0.0, PS, -; EG-NEXT: MULHI * T2.X, T1.Y, T3.W, -; EG-NEXT: CNDE_INT T1.Z, PS, PV.W, T1.Z, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T4.W, 0.0, T1.X, -; EG-NEXT: MULHI * T2.Y, T0.Z, T1.W, -; EG-NEXT: CNDE_INT T2.Z, PS, PV.W, T1.X, -; EG-NEXT: SETGT_INT T4.W, 0.0, T0.X, -; EG-NEXT: MULHI * T1.X, PV.Z, T1.Y, -; EG-NEXT: SETGT_INT T3.X, 0.0, T0.Y, -; EG-NEXT: ADD_INT T3.Y, T0.X, PV.W, -; EG-NEXT: ADD_INT T1.Z, T1.Y, PS, -; EG-NEXT: SUB_INT T5.W, T1.Y, PS, -; EG-NEXT: MULHI * T0.X, PV.Z, T0.Z, -; EG-NEXT: CNDE_INT T1.X, T2.X, PV.Z, PV.W, -; EG-NEXT: XOR_INT T1.Y, PV.Y, T4.W, -; EG-NEXT: ADD_INT T1.Z, T0.Y, PV.X, -; EG-NEXT: ADD_INT T5.W, T0.Z, PS, -; EG-NEXT: SUB_INT * T6.W, T0.Z, PS, -; EG-NEXT: CNDE_INT T0.Z, T2.Y, PV.W, PS, -; EG-NEXT: XOR_INT T5.W, PV.Z, T3.X, -; EG-NEXT: MULHI * T0.X, PV.X, PV.Y, -; EG-NEXT: MULHI * T0.Y, PV.Z, PV.W, +; EG-NEXT: SETGT_INT T5.W, 0.0, T0.X, +; EG-NEXT: MULLO_INT * T1.Z, PV.W, PS, +; EG-NEXT: SETGT_INT T2.Z, 0.0, T0.Y, +; EG-NEXT: ADD_INT T4.W, T0.X, PV.W, +; EG-NEXT: MULHI * T0.X, T1.Y, PS, +; EG-NEXT: ADD_INT T1.Y, T1.Y, PS, +; EG-NEXT: XOR_INT T1.Z, PV.W, T5.W, +; EG-NEXT: ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212 +; EG-NEXT: MULHI * T0.X, T1.X, T0.Z, +; EG-NEXT: ADD_INT T0.Z, T1.X, PS, +; EG-NEXT: XOR_INT T4.W, PV.W, T2.Z, +; EG-NEXT: MULHI * T0.X, PV.Z, PV.Y, +; EG-NEXT: MULHI * T0.Y, PV.W, PV.Z, ; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W, -; EG-NEXT: SUB_INT T6.W, T5.W, PS, -; EG-NEXT: MULLO_INT * T1.X, T0.X, T3.W, -; EG-NEXT: SUB_INT T1.Z, T1.Y, PS, -; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W, -; EG-NEXT: SETGE_UINT * T5.W, T5.W, T0.Z, -; EG-NEXT: AND_INT T2.Y, PV.W, PS, +; EG-NEXT: SUB_INT T4.W, T4.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, T3.W, +; EG-NEXT: SUB_INT T1.Y, T1.Z, PS, ; EG-NEXT: ADD_INT T0.Z, T0.Y, 1, -; EG-NEXT: SETGE_UINT T1.W, PV.Z, T3.W, -; EG-NEXT: SETGE_UINT * T3.W, T1.Y, T1.X, -; EG-NEXT: AND_INT T1.Y, PV.W, PS, -; EG-NEXT: ADD_INT T1.Z, T0.X, 1, -; EG-NEXT: CNDE_INT T1.W, PV.Y, T0.Y, PV.Z, -; EG-NEXT: ADD_INT * T6.W, T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Y, T5.W, PS, PV.W, -; EG-NEXT: XOR_INT T0.Z, T3.X, T0.W, -; EG-NEXT: CNDE_INT T0.W, PV.Y, T0.X, PV.Z, -; EG-NEXT: ADD_INT * T1.W, T0.X, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.Z, T3.W, PS, PV.W, -; EG-NEXT: XOR_INT T0.W, T4.W, T2.W, BS:VEC_120/SCL_212 +; EG-NEXT: SETGE_UINT T6.W, PV.W, T1.W, +; EG-NEXT: SUB_INT * T7.W, PV.W, T1.W, +; EG-NEXT: CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PV.Z, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T4.W, PV.Y, T3.W, +; EG-NEXT: SUB_INT * T6.W, PV.Y, T3.W, +; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PS, +; EG-NEXT: CNDE_INT T0.Z, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T4.W, PV.Y, 1, +; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.W, +; EG-NEXT: CNDE_INT T0.Y, PS, T0.Y, PV.W, +; EG-NEXT: XOR_INT T1.Z, T2.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: ADD_INT T0.W, PV.Z, 1, +; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T3.W, +; EG-NEXT: CNDE_INT T0.Z, PS, T0.Z, PV.W, +; EG-NEXT: XOR_INT T0.W, T5.W, T2.W, ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z, -; EG-NEXT: SUB_INT T0.Y, PS, T0.Z, +; EG-NEXT: SUB_INT T0.Y, PS, T1.Z, ; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W, ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, @@ -1214,138 +1202,118 @@ ; ; EG-LABEL: sdiv_v4i32: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 2, @13, KC0[], KC1[] -; EG-NEXT: TEX 0 @10 -; EG-NEXT: ALU 114, @16, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 101, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: ; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: -; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Z, -; EG-NEXT: ADD_INT * T2.W, T1.Z, PV.W, -; EG-NEXT: XOR_INT * T2.W, PV.W, T0.W, -; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: RECIP_UINT * T0.X, T2.W, -; EG-NEXT: MULLO_INT * T0.Y, PS, T2.W, -; EG-NEXT: SUB_INT T4.W, 0.0, PS, -; EG-NEXT: MULHI * T0.Z, T0.X, T2.W, -; EG-NEXT: CNDE_INT T4.W, PS, PV.W, T0.Y, -; EG-NEXT: SETGT_INT * T5.W, 0.0, T3.Z, -; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, -; EG-NEXT: SETGT_INT T2.Y, 0.0, T1.W, -; EG-NEXT: ADD_INT T1.Z, T3.Z, T5.W, BS:VEC_021/SCL_122 -; EG-NEXT: ADD_INT T4.W, T0.X, PS, -; EG-NEXT: SUB_INT * T6.W, T0.X, PS, -; EG-NEXT: CNDE_INT T0.Z, T0.Z, PV.W, PS, -; EG-NEXT: XOR_INT T4.W, PV.Z, T5.W, -; EG-NEXT: ADD_INT * T1.W, T1.W, PV.Y, -; EG-NEXT: XOR_INT T1.W, PS, T2.Y, -; EG-NEXT: MULHI * T0.X, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T6.W, 0.0, T1.Y, -; EG-NEXT: RECIP_UINT * T0.Y, PV.W, -; EG-NEXT: ADD_INT T7.W, T1.Y, PV.W, -; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W, -; EG-NEXT: XOR_INT T1.Z, PV.W, T6.W, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T7.W, 0.0, PS, -; EG-NEXT: MULHI * T1.Y, T0.Y, T1.W, -; EG-NEXT: CNDE_INT T7.W, PS, PV.W, T0.Z, -; EG-NEXT: RECIP_UINT * T0.Z, PV.Z, -; EG-NEXT: SETGT_INT T8.W, 0.0, T3.W, -; EG-NEXT: MULHI * T2.X, PV.W, T0.Y, -; EG-NEXT: ADD_INT T4.Y, T3.W, PV.W, -; EG-NEXT: ADD_INT T2.Z, T0.Y, PS, -; EG-NEXT: SUB_INT T3.W, T0.Y, PS, -; EG-NEXT: MULLO_INT * T0.Y, T0.Z, T1.Z, -; EG-NEXT: CNDE_INT T2.X, T1.Y, PV.Z, PV.W, -; EG-NEXT: XOR_INT T1.Y, PV.Y, T8.W, -; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.X, -; EG-NEXT: SUB_INT T3.W, 0.0, PS, -; EG-NEXT: MULHI * T3.Z, T0.Z, T1.Z, -; EG-NEXT: CNDE_INT T4.Z, PS, PV.W, T0.Y, -; EG-NEXT: ADD_INT T3.W, T1.X, PV.Z, -; EG-NEXT: MULHI * T0.Y, PV.X, PV.Y, -; EG-NEXT: XOR_INT T3.W, PV.W, T2.Z, BS:VEC_021/SCL_122 -; EG-NEXT: MULHI * T1.X, PV.Z, T0.Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.W, +; EG-NEXT: ADD_INT * T1.W, T1.W, PV.W, +; EG-NEXT: XOR_INT * T1.W, PV.W, T2.W, +; EG-NEXT: SUB_INT T3.W, 0.0, PV.W, ; EG-NEXT: RECIP_UINT * T2.X, PV.W, -; EG-NEXT: MULLO_INT * T4.X, PS, T3.W, -; EG-NEXT: SETGT_INT T4.Z, 0.0, T3.Y, -; EG-NEXT: SUB_INT T7.W, 0.0, PS, -; EG-NEXT: MULHI * T4.Y, T2.X, T3.W, -; EG-NEXT: CNDE_INT T4.X, PS, PV.W, T4.X, -; EG-NEXT: ADD_INT T3.Y, T3.Y, PV.Z, -; EG-NEXT: ADD_INT T5.Z, T0.Z, T1.X, -; EG-NEXT: SUB_INT T7.W, T0.Z, T1.X, -; EG-NEXT: MULLO_INT * T0.Z, T0.Y, T1.W, -; EG-NEXT: CNDE_INT T5.Y, T3.Z, PV.Z, PV.W, -; EG-NEXT: XOR_INT T3.Z, PV.Y, T4.Z, -; EG-NEXT: SUB_INT T7.W, T1.Y, PS, -; EG-NEXT: MULHI * T1.X, PV.X, T2.X, -; EG-NEXT: SETGE_UINT T5.Z, PV.W, T1.W, -; EG-NEXT: SETGE_UINT T1.W, T1.Y, T0.Z, -; EG-NEXT: MULHI * T0.Z, PV.Y, PV.Z, -; EG-NEXT: AND_INT T1.Y, PV.Z, PV.W, -; EG-NEXT: ADD_INT T5.Z, T0.Y, 1, -; EG-NEXT: SETGT_INT T7.W, 0.0, T3.X, -; EG-NEXT: MULLO_INT * T3.Y, PS, T1.Z, -; EG-NEXT: SUB_INT T4.X, T3.Z, PS, -; EG-NEXT: ADD_INT T5.Y, T3.X, PV.W, -; EG-NEXT: ADD_INT T6.Z, T2.X, T1.X, BS:VEC_120/SCL_212 -; EG-NEXT: SUB_INT * T9.W, T2.X, T1.X, BS:VEC_120/SCL_212 -; EG-NEXT: MULLO_INT * T1.X, T0.X, T2.W, -; EG-NEXT: CNDE_INT T2.X, T4.Y, T6.Z, T9.W, -; EG-NEXT: XOR_INT T4.Y, T5.Y, T7.W, BS:VEC_201 -; EG-NEXT: SUB_INT T6.Z, T4.W, PS, BS:VEC_120/SCL_212 -; EG-NEXT: SETGE_UINT T9.W, T4.X, T1.Z, BS:VEC_102/SCL_221 -; EG-NEXT: SETGE_UINT * T10.W, T3.Z, T3.Y, -; EG-NEXT: AND_INT T3.X, PV.W, PS, -; EG-NEXT: ADD_INT T3.Y, T0.Z, 1, -; EG-NEXT: SETGE_UINT T1.Z, PV.Z, T2.W, -; EG-NEXT: SETGE_UINT T2.W, T4.W, T1.X, -; EG-NEXT: MULHI * T1.X, PV.X, PV.Y, -; EG-NEXT: AND_INT T2.X, PV.Z, PV.W, -; EG-NEXT: ADD_INT T5.Y, T0.X, 1, -; EG-NEXT: CNDE_INT T1.Z, PV.X, T0.Z, PV.Y, -; EG-NEXT: ADD_INT T4.W, T0.Z, literal.x, -; EG-NEXT: MULLO_INT * T0.Z, PS, T3.W, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.X, T10.W, PV.W, PV.Z, -; EG-NEXT: CNDE_INT T3.Y, PV.X, T0.X, PV.Y, -; EG-NEXT: CNDE_INT T1.Z, T1.Y, T0.Y, T5.Z, -; EG-NEXT: ADD_INT T4.W, T0.Y, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: SUB_INT * T9.W, T4.Y, PS, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, T0.X, literal.x, -; EG-NEXT: SETGE_UINT T0.Y, PS, T3.W, -; EG-NEXT: SETGE_UINT T0.Z, T4.Y, T0.Z, -; EG-NEXT: CNDE_INT T1.W, T1.W, PV.W, PV.Z, -; EG-NEXT: XOR_INT * T3.W, T8.W, T2.Y, -; EG-NEXT: -1(nan), 0(0.000000e+00) +; EG-NEXT: SETGT_INT T4.W, 0.0, T0.W, +; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, +; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.Y, +; EG-NEXT: ADD_INT T0.W, T0.W, PV.W, +; EG-NEXT: MULHI * T2.Y, T2.X, PS, +; EG-NEXT: ADD_INT T3.Z, T2.X, PS, +; EG-NEXT: XOR_INT T0.W, PV.W, T4.W, +; EG-NEXT: ADD_INT * T3.W, T1.Y, PV.Z, +; EG-NEXT: XOR_INT T3.W, PS, T2.Z, +; EG-NEXT: MULHI * T1.Y, PV.W, PV.Z, +; EG-NEXT: SUB_INT T5.W, 0.0, PV.W, +; EG-NEXT: RECIP_UINT * T2.X, PV.W, +; EG-NEXT: SETGT_INT T6.W, 0.0, T0.Y, +; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, +; EG-NEXT: ADD_INT T5.W, T0.Y, PV.W, +; EG-NEXT: MULHI * T0.Y, T2.X, PS, +; EG-NEXT: ADD_INT T0.Y, T2.X, PS, +; EG-NEXT: XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGT_INT T5.W, 0.0, T1.Z, +; EG-NEXT: MULLO_INT * T2.X, T1.Y, T1.W, +; EG-NEXT: ADD_INT T7.W, T1.Z, PV.W, +; EG-NEXT: MULHI * T0.Y, PV.Z, PV.Y, +; EG-NEXT: XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122 +; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W, +; EG-NEXT: SUB_INT T4.Z, 0.0, PV.W, +; EG-NEXT: SETGT_INT T8.W, 0.0, T1.X, +; EG-NEXT: RECIP_UINT * T2.Y, PV.W, +; EG-NEXT: ADD_INT T9.W, T1.X, PV.W, +; EG-NEXT: MULLO_INT * T1.X, PV.Z, PS, +; EG-NEXT: SETGT_INT T4.Z, 0.0, T0.Z, +; EG-NEXT: XOR_INT T9.W, PV.W, T8.W, +; EG-NEXT: MULHI * T1.X, T2.Y, PS, +; EG-NEXT: ADD_INT T1.X, T2.Y, PS, +; EG-NEXT: SUB_INT T2.Y, 0.0, PV.W, +; EG-NEXT: SUB_INT T1.Z, T3.Z, T1.Z, +; EG-NEXT: ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201 +; EG-NEXT: RECIP_UINT * T0.Z, PV.W, +; EG-NEXT: XOR_INT T3.X, PV.W, T4.Z, +; EG-NEXT: ADD_INT T3.Y, T0.Y, 1, +; EG-NEXT: SETGE_UINT T3.Z, PV.Z, T3.W, +; EG-NEXT: SUB_INT T10.W, PV.Z, T3.W, +; EG-NEXT: MULLO_INT * T2.Y, PV.Y, PS, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, T1.Z, PV.W, +; EG-NEXT: CNDE_INT T10.W, PV.Z, T0.Y, PV.Y, +; EG-NEXT: MULHI * T0.Y, PV.X, T1.X, +; EG-NEXT: SETGT_INT T3.Y, 0.0, T0.X, +; EG-NEXT: ADD_INT T3.Z, PV.W, 1, +; EG-NEXT: SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122 +; EG-NEXT: MULLO_INT * T1.X, PS, T7.W, +; EG-NEXT: CNDE_INT T4.Y, PV.W, T10.W, PV.Z, +; EG-NEXT: ADD_INT T1.Z, T0.X, PV.Y, +; EG-NEXT: SUB_INT T3.W, T3.X, PS, BS:VEC_120/SCL_212 +; EG-NEXT: MULHI * T0.X, T0.Z, T2.Y, +; EG-NEXT: ADD_INT T1.X, T0.Y, 1, +; EG-NEXT: SETGE_UINT T2.Y, PV.W, T7.W, +; EG-NEXT: ADD_INT T0.Z, T0.Z, PS, +; EG-NEXT: XOR_INT T10.W, PV.Z, T3.Y, +; EG-NEXT: SUB_INT * T0.W, T0.W, T2.X, +; EG-NEXT: SUB_INT T0.X, T3.W, T7.W, +; EG-NEXT: ADD_INT T5.Y, T1.Y, 1, +; EG-NEXT: SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122 +; EG-NEXT: MULHI * T0.Z, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y, +; EG-NEXT: CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201 +; EG-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201 +; EG-NEXT: MULLO_INT * T0.X, PS, T9.W, +; EG-NEXT: ADD_INT T1.X, PV.W, 1, +; EG-NEXT: SETGE_UINT T0.Y, PV.Z, T7.W, +; EG-NEXT: ADD_INT T1.Z, PV.Y, 1, +; EG-NEXT: SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221 +; EG-NEXT: SUB_INT * T3.W, T10.W, PS, +; EG-NEXT: ADD_INT T0.X, T0.Z, 1, +; EG-NEXT: SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221 +; EG-NEXT: SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221 +; EG-NEXT: CNDE_INT T1.W, PV.W, T1.Y, PV.Z, +; EG-NEXT: XOR_INT * T2.W, T4.W, T2.W, ; EG-NEXT: XOR_INT T2.X, PV.W, PS, -; EG-NEXT: AND_INT T0.Y, PV.Y, PV.Z, -; EG-NEXT: ADD_INT T1.Z, T1.X, 1, -; EG-NEXT: CNDE_INT T1.W, T2.W, PV.X, T3.Y, -; EG-NEXT: XOR_INT * T0.W, T5.W, T0.W, -; EG-NEXT: XOR_INT T0.X, T4.Z, T6.W, BS:VEC_021/SCL_122 -; EG-NEXT: XOR_INT T1.Y, PV.W, PS, -; EG-NEXT: CNDE_INT T1.Z, PV.Y, T1.X, PV.Z, -; EG-NEXT: ADD_INT T1.W, T1.X, literal.x, -; EG-NEXT: SUB_INT * T3.W, PV.X, T3.W, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Y, T0.Z, PV.W, PV.Z, -; EG-NEXT: SUB_INT T3.Z, PV.Y, T0.W, -; EG-NEXT: XOR_INT T0.W, T7.W, T2.Z, -; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, -; EG-NEXT: SUB_INT T3.Y, PS, T0.X, +; EG-NEXT: CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T0.Z, PV.Y, T0.Z, PV.X, +; EG-NEXT: CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221 +; EG-NEXT: XOR_INT * T1.W, T4.Z, T5.W, +; EG-NEXT: XOR_INT T0.X, T6.W, T2.Z, +; EG-NEXT: XOR_INT T0.Y, PV.W, PS, +; EG-NEXT: ADD_INT T1.Z, PV.Z, 1, +; EG-NEXT: SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT * T2.W, PV.X, T2.W, +; EG-NEXT: CNDE_INT T1.Y, PV.W, T0.Z, PV.Z, +; EG-NEXT: SUB_INT T2.Z, PV.Y, T1.W, +; EG-NEXT: XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122 +; EG-NEXT: XOR_INT * T1.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT T2.Y, PS, T0.X, ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.W, -; EG-NEXT: SUB_INT T3.X, PV.W, T0.W, +; EG-NEXT: SUB_INT T2.X, PV.W, T0.W, ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -1947,7 +1915,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 43, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 39, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1965,37 +1933,33 @@ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T0.X, PV.W, ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W, -; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W, -; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W, -; EG-NEXT: RECIP_UINT * T0.X, PV.W, ; EG-NEXT: BFE_INT T2.W, T3.X, 0.0, literal.x, -; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, +; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHL T0.Z, PV.W, literal.x, -; EG-NEXT: SUB_INT T2.W, 0.0, PS, -; EG-NEXT: MULHI * T1.X, T0.X, T0.W, +; EG-NEXT: LSHL T2.W, PV.W, literal.x, +; EG-NEXT: XOR_INT * T0.W, PS, T1.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Y, -; EG-NEXT: OR_INT * T3.W, T2.X, PV.Z, -; EG-NEXT: SETGT_INT T4.W, 0.0, PS, -; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, -; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W, -; EG-NEXT: ADD_INT T2.W, T0.X, PS, -; EG-NEXT: SUB_INT * T3.W, T0.X, PS, -; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS, -; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W, -; EG-NEXT: MULHI * T0.X, PV.W, PS, +; EG-NEXT: SUB_INT T0.Z, 0.0, PS, +; EG-NEXT: OR_INT T2.W, T2.X, PV.W, +; EG-NEXT: RECIP_UINT * T0.X, PS, +; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS, +; EG-NEXT: ADD_INT T2.W, T2.W, PV.W, +; EG-NEXT: MULHI * T0.Y, T0.X, PS, +; EG-NEXT: ADD_INT T4.W, T0.X, PS, +; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W, +; EG-NEXT: MULHI * T0.X, PS, PV.W, ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, -; EG-NEXT: SUB_INT * T2.W, T3.W, PS, -; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W, -; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y, -; EG-NEXT: AND_INT T0.W, PV.W, PS, -; EG-NEXT: ADD_INT * T3.W, T0.X, 1, -; EG-NEXT: CNDE_INT T0.W, PV.W, T0.X, PS, -; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W, -; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W, +; EG-NEXT: SUB_INT * T2.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, +; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T5.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221 +; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W, ; EG-NEXT: XOR_INT * T0.W, PV.W, PS, ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W, ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, @@ -2161,7 +2125,7 @@ ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 41, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -2177,36 +2141,32 @@ ; EG-NEXT: ASHR * T0.W, PV.W, literal.x, ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W, -; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W, +; EG-NEXT: ADD_INT T0.W, T0.W, PV.W, +; EG-NEXT: LSHL * T2.W, T1.X, literal.x, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) ; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W, +; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W, +; EG-NEXT: ASHR T2.W, T2.W, literal.x, ; EG-NEXT: RECIP_UINT * T0.X, PV.W, -; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, -; EG-NEXT: LSHL T0.Z, T1.X, literal.x, -; EG-NEXT: SUB_INT T2.W, 0.0, PS, -; EG-NEXT: MULHI * T1.X, T0.X, T0.W, -; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Y, -; EG-NEXT: ASHR * T3.W, PV.Z, literal.x, ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; EG-NEXT: SETGT_INT T4.W, 0.0, PS, -; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, -; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W, -; EG-NEXT: ADD_INT T2.W, T0.X, PS, -; EG-NEXT: SUB_INT * T3.W, T0.X, PS, -; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS, -; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W, -; EG-NEXT: MULHI * T0.X, PV.W, PS, +; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W, +; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS, +; EG-NEXT: ADD_INT T2.W, T2.W, PV.W, +; EG-NEXT: MULHI * T0.Y, T0.X, PS, +; EG-NEXT: ADD_INT T4.W, T0.X, PS, +; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W, +; EG-NEXT: MULHI * T0.X, PS, PV.W, ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, -; EG-NEXT: SUB_INT * T2.W, T3.W, PS, -; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W, -; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y, -; EG-NEXT: AND_INT T0.W, PV.W, PS, -; EG-NEXT: ADD_INT * T3.W, T0.X, 1, -; EG-NEXT: CNDE_INT T0.W, PV.W, T0.X, PS, -; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W, -; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W, +; EG-NEXT: SUB_INT * T2.W, T2.W, PS, +; EG-NEXT: ADD_INT T0.Z, T0.X, 1, +; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, +; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z, +; EG-NEXT: ADD_INT T5.W, PS, 1, +; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221 +; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W, ; EG-NEXT: XOR_INT * T0.W, PV.W, PS, ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W, ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -6,37 +6,31 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) { ; R600-LABEL: test_udivrem: ; R600: ; %bb.0: -; R600-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1 +; R600-NEXT: ALU 21, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[9].X, ; R600-NEXT: RECIP_UINT * T0.X, KC0[9].X, -; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[9].X, -; R600-NEXT: SUB_INT T0.W, 0.0, PS, -; R600-NEXT: MULHI * T0.Z, T0.X, KC0[9].X, -; R600-NEXT: CNDE_INT * T0.W, PS, PV.W, T0.Y, -; R600-NEXT: MULHI * T0.Y, PV.W, T0.X, -; R600-NEXT: ADD_INT T0.W, T0.X, PS, -; R600-NEXT: SUB_INT * T1.W, T0.X, PS, -; R600-NEXT: CNDE_INT * T0.W, T0.Z, PV.W, PS, -; R600-NEXT: MULHI * T0.X, PV.W, KC0[6].W, +; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS, +; R600-NEXT: MULHI * T0.Y, T0.X, PS, +; R600-NEXT: ADD_INT * T0.W, T0.X, PS, +; R600-NEXT: MULHI * T0.X, KC0[6].W, PV.W, ; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[9].X, ; R600-NEXT: SUB_INT * T0.W, KC0[6].W, PS, -; R600-NEXT: SETGE_UINT T1.W, PV.W, KC0[9].X, -; R600-NEXT: SETGE_UINT * T2.W, KC0[6].W, T0.Y, -; R600-NEXT: AND_INT T1.W, PV.W, PS, -; R600-NEXT: SUB_INT * T3.W, T0.W, KC0[9].X, -; R600-NEXT: CNDE_INT T3.W, PV.W, T0.W, PS, -; R600-NEXT: ADD_INT * T0.W, T0.W, KC0[9].X, -; R600-NEXT: CNDE_INT T1.X, T2.W, PS, PV.W, -; R600-NEXT: ADD_INT T0.W, T0.X, 1, -; R600-NEXT: LSHR * T2.X, KC0[4].Z, literal.x, +; R600-NEXT: SUB_INT T1.W, PV.W, KC0[9].X, +; R600-NEXT: SETGE_UINT * T2.W, PV.W, KC0[9].X, +; R600-NEXT: CNDE_INT * T0.W, PS, T0.W, PV.W, +; R600-NEXT: ADD_INT T0.Z, T0.X, 1, +; R600-NEXT: SUB_INT T1.W, PV.W, KC0[9].X, +; R600-NEXT: SETGE_UINT * T3.W, PV.W, KC0[9].X, +; R600-NEXT: CNDE_INT T1.X, PS, T0.W, PV.W, +; R600-NEXT: CNDE_INT T0.W, T2.W, T0.X, PV.Z, +; R600-NEXT: LSHR * T0.X, KC0[4].Z, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.W, T1.W, T0.X, PV.W, -; R600-NEXT: ADD_INT * T1.W, T0.X, literal.x, -; R600-NEXT: -1(nan), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.X, T2.W, PS, PV.W, +; R600-NEXT: ADD_INT * T1.W, PV.W, 1, +; R600-NEXT: CNDE_INT T2.X, T3.W, T0.W, PV.W, ; R600-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -123,49 +117,39 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { ; R600-LABEL: test_udivrem_v2: ; R600: ; %bb.0: -; R600-NEXT: ALU 39, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 29, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[3].Z, ; R600-NEXT: RECIP_UINT * T0.X, KC0[3].Z, -; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[3].Z, +; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS, +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[3].Y, ; R600-NEXT: RECIP_UINT * T0.Z, KC0[3].Y, -; R600-NEXT: MULLO_INT * T0.W, PS, KC0[3].Y, -; R600-NEXT: SUB_INT T1.W, 0.0, PS, -; R600-NEXT: MULHI * T1.X, T0.Z, KC0[3].Y, -; R600-NEXT: CNDE_INT T1.Z, PS, PV.W, T0.W, -; R600-NEXT: SUB_INT T0.W, 0.0, T0.Y, -; R600-NEXT: MULHI * T1.Y, T0.X, KC0[3].Z, -; R600-NEXT: CNDE_INT T0.W, PS, PV.W, T0.Y, -; R600-NEXT: MULHI * T0.Y, PV.Z, T0.Z, -; R600-NEXT: ADD_INT T1.Z, T0.Z, PS, -; R600-NEXT: SUB_INT T1.W, T0.Z, PS, -; R600-NEXT: MULHI * T0.Y, PV.W, T0.X, -; R600-NEXT: CNDE_INT T0.Z, T1.X, PV.Z, PV.W, -; R600-NEXT: ADD_INT T0.W, T0.X, PS, BS:VEC_120/SCL_212 -; R600-NEXT: SUB_INT * T1.W, T0.X, PS, -; R600-NEXT: CNDE_INT T0.W, T1.Y, PV.W, PS, -; R600-NEXT: MULHI * T0.X, PV.Z, KC0[2].W, -; R600-NEXT: MULHI * T0.Y, PV.W, KC0[3].X, +; R600-NEXT: MULLO_INT * T0.W, PV.W, PS, +; R600-NEXT: MULHI * T0.W, T0.Z, PS, +; R600-NEXT: ADD_INT T0.W, T0.Z, PS, +; R600-NEXT: MULHI * T0.Y, T0.X, T0.Y, +; R600-NEXT: ADD_INT T1.W, T0.X, PS, +; R600-NEXT: MULHI * T0.X, KC0[2].W, PV.W, +; R600-NEXT: MULHI * T0.Y, KC0[3].X, PV.W, ; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[3].Z, ; R600-NEXT: SUB_INT T0.W, KC0[3].X, PS, ; R600-NEXT: MULLO_INT * T0.X, T0.X, KC0[3].Y, ; R600-NEXT: SUB_INT T0.Z, KC0[2].W, PS, -; R600-NEXT: SETGE_UINT * T1.W, PV.W, KC0[3].Z, -; R600-NEXT: SETGE_UINT * T2.W, KC0[3].X, T0.Y, -; R600-NEXT: AND_INT T0.Y, T1.W, PV.W, -; R600-NEXT: SUB_INT T1.Z, T0.W, KC0[3].Z, BS:VEC_120/SCL_212 -; R600-NEXT: SETGE_UINT * T1.W, T0.Z, KC0[3].Y, -; R600-NEXT: SETGE_UINT * T3.W, KC0[2].W, T0.X, -; R600-NEXT: AND_INT T1.Y, T1.W, PV.W, -; R600-NEXT: SUB_INT T2.Z, T0.Z, KC0[3].Y, -; R600-NEXT: CNDE_INT T1.W, T0.Y, T0.W, T1.Z, -; R600-NEXT: ADD_INT * T0.W, T0.W, KC0[3].Z, -; R600-NEXT: CNDE_INT T0.Y, T2.W, PS, PV.W, -; R600-NEXT: CNDE_INT T0.W, PV.Y, T0.Z, PV.Z, -; R600-NEXT: ADD_INT * T1.W, T0.Z, KC0[3].Y, -; R600-NEXT: CNDE_INT T0.X, T3.W, PS, PV.W, +; R600-NEXT: SETGE_UINT T1.W, PV.W, KC0[3].Z, +; R600-NEXT: SUB_INT * T2.W, PV.W, KC0[3].Z, +; R600-NEXT: CNDE_INT T1.Z, PV.W, T0.W, PS, +; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Y, +; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Y, +; R600-NEXT: CNDE_INT T0.Z, PV.W, T0.Z, PS, +; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Z, +; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Z, +; R600-NEXT: CNDE_INT T0.Y, PV.W, T1.Z, PS, +; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Y, +; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Y, +; R600-NEXT: CNDE_INT T0.X, PV.W, T0.Z, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; @@ -268,88 +252,68 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { ; R600-LABEL: test_udivrem_v4: ; R600: ; %bb.0: -; R600-NEXT: ALU 77, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: ALU 57, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[5].X, ; R600-NEXT: RECIP_UINT * T0.X, KC0[5].X, -; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[5].X, -; R600-NEXT: SUB_INT T0.W, 0.0, PS, -; R600-NEXT: MULHI * T0.Z, T0.X, KC0[5].X, -; R600-NEXT: CNDE_INT * T0.W, PS, PV.W, T0.Y, -; R600-NEXT: MULHI * T0.Y, PV.W, T0.X, -; R600-NEXT: RECIP_UINT * T0.W, KC0[4].Y, -; R600-NEXT: MULLO_INT * T1.X, PS, KC0[4].Y, -; R600-NEXT: SUB_INT T1.W, 0.0, PS, -; R600-NEXT: MULHI * T1.Y, T0.W, KC0[4].Y, -; R600-NEXT: CNDE_INT T1.Z, PS, PV.W, T1.X, BS:VEC_021/SCL_122 -; R600-NEXT: ADD_INT T1.W, T0.X, T0.Y, -; R600-NEXT: SUB_INT * T2.W, T0.X, T0.Y, -; R600-NEXT: CNDE_INT T1.W, T0.Z, PV.W, PS, -; R600-NEXT: MULHI * T0.X, PV.Z, T0.W, -; R600-NEXT: MULHI * T0.Y, PV.W, KC0[4].X, +; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS, +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[4].Z, ; R600-NEXT: RECIP_UINT * T0.Z, KC0[4].Z, -; R600-NEXT: MULLO_INT * T1.X, PS, KC0[4].Z, -; R600-NEXT: SUB_INT T1.W, 0.0, PS, -; R600-NEXT: MULHI * T1.Z, T0.Z, KC0[4].Z, -; R600-NEXT: CNDE_INT T1.W, PS, PV.W, T1.X, +; R600-NEXT: MULLO_INT * T0.W, PV.W, PS, +; R600-NEXT: MULHI * T0.W, T0.Z, PS, +; R600-NEXT: ADD_INT T0.W, T0.Z, PS, +; R600-NEXT: MULHI * T0.Y, T0.X, T0.Y, +; R600-NEXT: ADD_INT T1.W, T0.X, PS, +; R600-NEXT: MULHI * T0.X, KC0[3].Z, PV.W, +; R600-NEXT: MULHI * T0.Y, KC0[4].X, PV.W, +; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[5].X, +; R600-NEXT: RECIP_UINT * T0.Z, KC0[4].Y, +; R600-NEXT: SUB_INT T0.W, 0.0, KC0[4].W, ; R600-NEXT: RECIP_UINT * T1.X, KC0[4].W, -; R600-NEXT: MULHI * T1.W, PV.W, T0.Z, -; R600-NEXT: ADD_INT T2.Z, T0.Z, PS, -; R600-NEXT: SUB_INT T1.W, T0.Z, PS, -; R600-NEXT: MULLO_INT * T0.Z, T1.X, KC0[4].W, -; R600-NEXT: CNDE_INT T1.Z, T1.Z, PV.Z, PV.W, -; R600-NEXT: SUB_INT T1.W, 0.0, PS, -; R600-NEXT: MULHI * T2.X, T1.X, KC0[4].W, -; R600-NEXT: CNDE_INT T1.W, PS, PV.W, T0.Z, -; R600-NEXT: MULHI * T0.Z, PV.Z, KC0[3].Z, -; R600-NEXT: MULHI * T1.Z, PV.W, T1.X, -; R600-NEXT: ADD_INT T2.Z, T1.X, PS, -; R600-NEXT: SUB_INT T1.W, T1.X, PS, -; R600-NEXT: MULLO_INT * T0.Z, T0.Z, KC0[4].Z, -; R600-NEXT: CNDE_INT T1.Z, T2.X, PV.Z, PV.W, -; R600-NEXT: SUB_INT T1.W, KC0[3].Z, PS, -; R600-NEXT: MULLO_INT * T0.Y, T0.Y, KC0[5].X, -; R600-NEXT: SUB_INT T1.X, PV.W, KC0[4].Z, -; R600-NEXT: SUB_INT T2.Y, KC0[4].X, PS, -; R600-NEXT: ADD_INT T2.Z, T0.W, T0.X, -; R600-NEXT: SUB_INT * T0.W, T0.W, T0.X, -; R600-NEXT: MULHI * T0.X, T1.Z, KC0[3].W, -; R600-NEXT: CNDE_INT T1.Y, T1.Y, T2.Z, T0.W, -; R600-NEXT: SETGE_UINT T1.Z, T2.Y, KC0[5].X, BS:VEC_120/SCL_212 -; R600-NEXT: SETGE_UINT * T0.W, KC0[4].X, T0.Y, BS:VEC_021/SCL_122 -; R600-NEXT: MULLO_INT * T0.X, T0.X, KC0[4].W, -; R600-NEXT: ADD_INT T2.X, T2.Y, KC0[5].X, -; R600-NEXT: AND_INT T0.Y, T1.Z, T0.W, -; R600-NEXT: SUB_INT T1.Z, T2.Y, KC0[5].X, -; R600-NEXT: SUB_INT * T2.W, KC0[3].W, PS, -; R600-NEXT: MULHI * T1.Y, T1.Y, KC0[3].Y, -; R600-NEXT: ADD_INT T3.X, T2.W, KC0[4].W, -; R600-NEXT: CNDE_INT T0.Y, T0.Y, T2.Y, T1.Z, -; R600-NEXT: SETGE_UINT T1.Z, T2.W, KC0[4].W, -; R600-NEXT: SETGE_UINT * T3.W, KC0[3].W, T0.X, -; R600-NEXT: MULLO_INT * T0.X, T1.Y, KC0[4].Y, -; R600-NEXT: SUB_INT T4.X, KC0[3].Y, PS, -; R600-NEXT: AND_INT T1.Y, T1.Z, T3.W, -; R600-NEXT: SUB_INT T1.Z, T2.W, KC0[4].W, -; R600-NEXT: SETGE_UINT * T4.W, T1.W, KC0[4].Z, BS:VEC_201 -; R600-NEXT: SETGE_UINT * T5.W, KC0[3].Z, T0.Z, -; R600-NEXT: AND_INT T5.X, T4.W, PV.W, -; R600-NEXT: CNDE_INT T1.Y, T1.Y, T2.W, T1.Z, BS:VEC_210 -; R600-NEXT: SETGE_UINT T0.Z, T4.X, KC0[4].Y, -; R600-NEXT: SETGE_UINT T2.W, KC0[3].Y, T0.X, BS:VEC_021/SCL_122 -; R600-NEXT: CNDE_INT * T0.W, T0.W, T2.X, T0.Y, -; R600-NEXT: AND_INT T0.X, PV.Z, PV.W, -; R600-NEXT: SUB_INT T2.Y, T4.X, KC0[4].Y, -; R600-NEXT: CNDE_INT T0.Z, T3.W, T3.X, PV.Y, -; R600-NEXT: CNDE_INT T3.W, PV.X, T1.W, T1.X, -; R600-NEXT: ADD_INT * T1.W, T1.W, KC0[4].Z, -; R600-NEXT: CNDE_INT T0.Y, T5.W, PS, PV.W, -; R600-NEXT: CNDE_INT T1.W, PV.X, T4.X, PV.Y, -; R600-NEXT: ADD_INT * T3.W, T4.X, KC0[4].Y, -; R600-NEXT: CNDE_INT T0.X, T2.W, PS, PV.W, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: MULLO_INT * T0.W, PV.W, PS, +; R600-NEXT: SUB_INT T1.W, 0.0, KC0[4].Y, +; R600-NEXT: MULHI * T0.W, T1.X, PS, +; R600-NEXT: ADD_INT T0.W, T1.X, PS, +; R600-NEXT: MULLO_INT * T1.X, PV.W, T0.Z, +; R600-NEXT: MULHI * T0.W, KC0[3].W, PV.W, +; R600-NEXT: MULLO_INT * T0.W, PS, KC0[4].W, +; R600-NEXT: SUB_INT T0.W, KC0[3].W, PS, +; R600-NEXT: MULHI * T1.X, T0.Z, T1.X, +; R600-NEXT: SETGE_UINT T1.Y, PV.W, KC0[4].W, +; R600-NEXT: ADD_INT T0.Z, T0.Z, PS, +; R600-NEXT: SUB_INT T1.W, KC0[4].X, T0.Y, +; R600-NEXT: MULLO_INT * T0.X, T0.X, KC0[4].Z, +; R600-NEXT: SUB_INT T0.Y, KC0[3].Z, PS, +; R600-NEXT: SETGE_UINT T1.Z, PV.W, KC0[5].X, +; R600-NEXT: SUB_INT * T2.W, PV.W, KC0[5].X, +; R600-NEXT: MULHI * T0.X, KC0[3].Y, T0.Z, +; R600-NEXT: SUB_INT T1.X, T0.W, KC0[4].W, +; R600-NEXT: CNDE_INT T2.Y, T1.Z, T1.W, T2.W, +; R600-NEXT: SETGE_UINT T0.Z, T0.Y, KC0[4].Z, +; R600-NEXT: SUB_INT T1.W, T0.Y, KC0[4].Z, +; R600-NEXT: MULLO_INT * T0.X, PS, KC0[4].Y, +; R600-NEXT: CNDE_INT T2.X, PV.Z, T0.Y, PV.W, +; R600-NEXT: SETGE_UINT T0.Y, PV.Y, KC0[5].X, +; R600-NEXT: SUB_INT T0.Z, PV.Y, KC0[5].X, +; R600-NEXT: SUB_INT T1.W, KC0[3].Y, PS, +; R600-NEXT: CNDE_INT * T0.W, T1.Y, T0.W, PV.X, +; R600-NEXT: SETGE_UINT T0.X, PS, KC0[4].W, +; R600-NEXT: SUB_INT T1.Y, PS, KC0[4].W, +; R600-NEXT: SETGE_UINT T1.Z, PV.W, KC0[4].Y, +; R600-NEXT: SUB_INT T2.W, PV.W, KC0[4].Y, +; R600-NEXT: CNDE_INT * T3.W, PV.Y, T2.Y, PV.Z, +; R600-NEXT: CNDE_INT T0.Y, PV.Z, T1.W, PV.W, +; R600-NEXT: CNDE_INT T3.Z, PV.X, T0.W, PV.Y, BS:VEC_021/SCL_122 +; R600-NEXT: SETGE_UINT T0.W, T2.X, KC0[4].Z, +; R600-NEXT: SUB_INT * T1.W, T2.X, KC0[4].Z, +; R600-NEXT: CNDE_INT T3.Y, PV.W, T2.X, PS, +; R600-NEXT: SETGE_UINT T0.W, PV.Y, KC0[4].Y, +; R600-NEXT: SUB_INT * T1.W, PV.Y, KC0[4].Y, +; R600-NEXT: CNDE_INT T3.X, PV.W, T0.Y, PS, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX6-LABEL: test_udivrem_v4: