Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3618,27 +3618,49 @@ NewMode = DAG.getConstant( AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32); } else { - SDValue BitTable = - DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64); + // If we know the input can only be one of the supported standard modes in + // the range 0-3, we can use a simplified mapping to hardware values. + KnownBits KB = DAG.computeKnownBits(NewMode); + const bool UseReducedTable = KB.countMinLeadingZeros() >= 30; - SDValue Four = DAG.getConstant(4, SL, MVT::i32); - SDValue IsStandardValue = - DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT); - SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four); - SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, - NewMode, OffsetEnum); + if (UseReducedTable) { + SDValue BitTable = DAG.getConstant( + AMDGPU::FltRoundToHWConversionTableStandardOnly, SL, MVT::i32); - SDValue Two = DAG.getConstant(2, SL, MVT::i32); - SDValue RoundModeTimesNumBits = - DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two); + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two); - SDValue TableValue = - DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); - SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + // The shift result is already i32, so no truncate is needed here. + NewMode = + DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits); + + // TODO: SimplifyDemandedBits on the setreg source here can likely reduce + // the table extracted bits into inline immediates.
+ } else { + SDValue BitTable = + DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64); + + SDValue Four = DAG.getConstant(4, SL, MVT::i32); + SDValue IsStandardValue = + DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT); + SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four); - // No need to mask out the high bits since the setreg will ignore them - // anyway. - NewMode = TruncTable; + SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, + NewMode, OffsetEnum); + + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two); + + SDValue TableValue = + DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); + SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + + // No need to mask out the high bits since the setreg will ignore them + // anyway. + NewMode = TruncTable; + } // Insert a readfirstlane in case the value is a VGPR. We could do this // earlier and keep more operations scalar, but that interferes with Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h @@ -146,6 +146,9 @@ // values extern const uint64_t FltRoundToHWConversionTable; +// Reduced version of FltRoundToHWConversionTable that only works for 0-3. +extern const uint32_t FltRoundToHWConversionTableStandardOnly; + /// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
constexpr uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds) { uint32_t IndexVal = FltRounds; Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -281,3 +281,32 @@ static_assert( decodeFltRoundToHWConversionTable(TowardZeroF32_TowardNegativeF64) == getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)); + +/// Encode the mode register value that uses \p HWVal for both arguments of +/// getModeRegisterRoundMode into the 4-bit table slot for \p FltRoundsVal. +static constexpr uint32_t +encodeFltRoundsToHWTableReduced(uint32_t HWVal, uint32_t FltRoundsVal) { + return getModeRegisterRoundMode(HWVal, HWVal) << (FltRoundsVal << 2); +} + +/// Read back the 4-bit reduced-table entry indexed by the rounding mode value +/// \p FltRounds, which must be in the range 0-3. +static constexpr uint32_t decodeFltRoundsToHWTableReduced(uint32_t FltRounds) { + return (FltRoundToHWConversionTableStandardOnly >> (FltRounds << 2)) & 0xf; +} + +constexpr uint32_t AMDGPU::FltRoundToHWConversionTableStandardOnly = + encodeFltRoundsToHWTableReduced(HWTowardZero, TowardZero) | + encodeFltRoundsToHWTableReduced(HWNearestTiesToEven, NearestTiesToEven) | + encodeFltRoundsToHWTableReduced(HWTowardPositive, TowardPositive) | + encodeFltRoundsToHWTableReduced(HWTowardNegative, TowardNegative); + +static_assert(decodeFltRoundsToHWTableReduced(TowardZero) == + getModeRegisterRoundMode(HWTowardZero, HWTowardZero)); +static_assert(decodeFltRoundsToHWTableReduced(NearestTiesToEven) == + getModeRegisterRoundMode(HWNearestTiesToEven, + HWNearestTiesToEven)); +static_assert(decodeFltRoundsToHWTableReduced(TowardPositive) == + getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive)); +static_assert(decodeFltRoundsToHWTableReduced(TowardNegative) == + getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative)); Index: llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -914,56 +914,34 @@ ; GFX6-LABEL: s_set_rounding_i2_zeroext: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s34, s4, -4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 4 -; GFX6-NEXT: s_cselect_b32 s34, s4, s34 -; GFX6-NEXT: s_lshl_b32 s36, s34, 2 -; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX6-NEXT: s_lshl_b32 s34, s4, 2 +; GFX6-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: s_set_rounding_i2_zeroext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_or_b32 s34, s4, -4 -; GFX7-NEXT: s_cmp_lt_u32 s4, 4 -; GFX7-NEXT: s_cselect_b32 s34, s4, s34 -; GFX7-NEXT: s_lshl_b32 s36, s34, 2 -; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX7-NEXT: s_lshl_b32 s34, s4, 2 +; GFX7-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: s_set_rounding_i2_zeroext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s36, 0xffff, s4 -; GFX8-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 -; GFX8-NEXT: s_or_b32 s37, s36, -4 -; GFX8-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GFX8-NEXT: s_cselect_b32 s34, s36, s37 -; GFX8-NEXT: s_lshl_b32 s36, s34, 2 -; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX8-NEXT: s_and_b32 s34, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s34, s34, 2 +; GFX8-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_set_rounding_i2_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s36, 0xffff, s4 -; GFX9-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 -; GFX9-NEXT: s_or_b32 s37, 
s36, -4 -; GFX9-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GFX9-NEXT: s_cselect_b32 s34, s36, s37 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_and_b32 s34, 0xffff, s4 +; GFX9-NEXT: s_lshl_b32 s34, s34, 2 +; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -971,15 +949,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_lt_u16_e64 s34, s4, 4 -; GFX10-NEXT: s_and_b32 s35, 0xffff, s4 -; GFX10-NEXT: s_or_b32 s36, s35, -4 -; GFX10-NEXT: s_and_b32 s34, s34, exec_lo -; GFX10-NEXT: s_cselect_b32 s34, s35, s36 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_and_b32 s34, 0xffff, s4 +; GFX10-NEXT: s_lshl_b32 s34, s34, 2 +; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -987,15 +959,9 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_cmp_lt_u16_e64 s0, s4, 4 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s4 -; GFX11-NEXT: s_or_b32 s2, s1, -4 -; GFX11-NEXT: s_and_b32 s0, s0, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %zext.rounding = zext i2 %rounding to i32 @@ -1281,10 +1247,8 @@ ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 
; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 ; GFX6-NEXT: v_readfirstlane_b32 s34, v0 ; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1295,10 +1259,8 @@ ; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0 +; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0 ; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1309,10 +1271,9 @@ ; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX8-NEXT: s_mov_b32 s34, 0xa50f +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 ; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1323,10 +1284,9 @@ ; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, 0xa50f +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34 ; GFX9-NEXT: v_readfirstlane_b32 s34, v0 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1338,10 
+1298,8 @@ ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b32 s34, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX10-NEXT: v_readfirstlane_b32 s34, v0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1353,10 +1311,8 @@ ; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1371,14 +1327,7 @@ ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b32 s34, 1, 3 -; GFX678-NEXT: s_or_b32 s35, s34, -4 -; GFX678-NEXT: s_cmp_lt_u32 s34, 4 -; GFX678-NEXT: s_cselect_b32 s34, s34, s35 -; GFX678-NEXT: s_lshl_b32 s36, s34, 2 -; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_cselect_b32 s34, 0xa50, 10 ; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; @@ -1386,14 +1335,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s34, 1, 3 -; GFX9-NEXT: s_or_b32 s35, s34, -4 -; GFX9-NEXT: s_cmp_lt_u32 s34, 4 -; GFX9-NEXT: s_cselect_b32 s34, s34, s35 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 
s[34:35], s[34:35], s36 +; GFX9-NEXT: s_cselect_b32 s34, 0xa50, 10 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1402,14 +1344,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, 1, 3 -; GFX10-NEXT: s_or_b32 s35, s34, -4 -; GFX10-NEXT: s_cmp_lt_u32 s34, 4 -; GFX10-NEXT: s_cselect_b32 s34, s34, s35 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_cselect_b32 s34, 0xa50, 10 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,14 +1353,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, 1, 3 -; GFX11-NEXT: s_or_b32 s1, s0, -4 -; GFX11-NEXT: s_cmp_lt_u32 s0, 4 -; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_cselect_b32 s0, 0xa50, 10 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0 @@ -1435,66 +1363,22 @@ } define void @v_set_rounding_select_1_3(i32 %cond) { -; GFX6-LABEL: v_set_rounding_select_1_3: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 -; GFX6-NEXT: v_readfirstlane_b32 
s4, v0 -; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: v_set_rounding_select_1_3: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc -; GFX7-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX7-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9 -; GFX7-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 -; GFX7-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: v_set_rounding_select_1_3: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc -; GFX8-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX678-LABEL: v_set_rounding_select_1_3: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v1, 0xa50 +; GFX678-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX678-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc +; GFX678-NEXT: v_readfirstlane_b32 s4, v0 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_set_rounding_select_1_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0xa50 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc -; 
GFX9-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1504,14 +1388,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1521,14 +1398,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1539,23 +1409,42 @@ } define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) { -; GFX678-LABEL: 
s_set_rounding_select_2_0: -; GFX678: ; %bb.0: -; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX678-NEXT: v_readfirstlane_b32 s34, v0 -; GFX678-NEXT: s_lshl_b32 s34, s34, 1 -; GFX678-NEXT: s_or_b32 s35, s34, -4 -; GFX678-NEXT: s_cmp_lt_u32 s34, 4 -; GFX678-NEXT: s_cselect_b32 s34, s34, s35 -; GFX678-NEXT: s_lshl_b32 s36, s34, 2 -; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 -; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 -; GFX678-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: s_set_rounding_select_2_0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 +; GFX6-NEXT: v_readfirstlane_b32 s34, v0 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_select_2_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_cmp_eq_u32 s4, 0 +; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 +; GFX7-NEXT: v_readfirstlane_b32 s34, v0 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_set_rounding_select_2_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_mov_b32 s34, 0xa50f +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, 
s34 +; GFX8-NEXT: v_readfirstlane_b32 s34, v0 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_set_rounding_select_2_0: ; GFX9: ; %bb.0: @@ -1563,15 +1452,10 @@ ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b32 s34, 0xa50f +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34 ; GFX9-NEXT: v_readfirstlane_b32 s34, v0 -; GFX9-NEXT: s_lshl_b32 s34, s34, 1 -; GFX9-NEXT: s_or_b32 s35, s34, -4 -; GFX9-NEXT: s_cmp_lt_u32 s34, 4 -; GFX9-NEXT: s_cselect_b32 s34, s34, s35 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,15 +1466,9 @@ ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b32 s34, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX10-NEXT: v_readfirstlane_b32 s34, v0 -; GFX10-NEXT: s_lshl_b32 s34, s34, 1 -; GFX10-NEXT: s_or_b32 s35, s34, -4 -; GFX10-NEXT: s_cmp_lt_u32 s34, 4 -; GFX10-NEXT: s_cselect_b32 s34, s34, s35 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1601,15 +1479,9 @@ ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_or_b32 s1, s0, -4 -; GFX11-NEXT: s_cmp_lt_u32 s0, 4 -; GFX11-NEXT: 
s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0 @@ -1623,14 +1495,8 @@ ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b32 s34, 2, 1 -; GFX678-NEXT: s_or_b32 s35, s34, -4 -; GFX678-NEXT: s_cmp_lt_u32 s34, 4 -; GFX678-NEXT: s_cselect_b32 s34, s34, s35 -; GFX678-NEXT: s_lshl_b32 s36, s34, 2 -; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_movk_i32 s34, 0xa5 +; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50 ; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; @@ -1638,14 +1504,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s34, 2, 1 -; GFX9-NEXT: s_or_b32 s35, s34, -4 -; GFX9-NEXT: s_cmp_lt_u32 s34, 4 -; GFX9-NEXT: s_cselect_b32 s34, s34, s35 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_movk_i32 s34, 0xa5 +; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1654,14 +1514,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, 2, 1 -; GFX10-NEXT: s_or_b32 s35, s34, -4 -; GFX10-NEXT: s_cmp_lt_u32 s34, 4 -; GFX10-NEXT: s_cselect_b32 s34, s34, s35 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: 
s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_movk_i32 s34, 0xa5 +; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1670,14 +1524,8 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, 2, 1 -; GFX11-NEXT: s_or_b32 s1, s0, -4 -; GFX11-NEXT: s_cmp_lt_u32 s0, 4 -; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_movk_i32 s0, 0xa5 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0 @@ -1691,14 +1539,8 @@ ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b32 s34, 1, 2 -; GFX678-NEXT: s_or_b32 s35, s34, -4 -; GFX678-NEXT: s_cmp_lt_u32 s34, 4 -; GFX678-NEXT: s_cselect_b32 s34, s34, s35 -; GFX678-NEXT: s_lshl_b32 s36, s34, 2 -; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_movk_i32 s34, 0xa50 +; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa5 ; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; @@ -1706,14 +1548,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s34, 1, 2 -; GFX9-NEXT: s_or_b32 s35, s34, -4 -; GFX9-NEXT: s_cmp_lt_u32 s34, 4 -; GFX9-NEXT: s_cselect_b32 s34, s34, s35 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_movk_i32 s34, 0xa50 +; GFX9-NEXT: 
s_cselect_b32 s34, s34, 0xa5 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1722,14 +1558,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, 1, 2 -; GFX10-NEXT: s_or_b32 s35, s34, -4 -; GFX10-NEXT: s_cmp_lt_u32 s34, 4 -; GFX10-NEXT: s_cselect_b32 s34, s34, s35 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_movk_i32 s34, 0xa50 +; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa5 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1738,14 +1568,8 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, 1, 2 -; GFX11-NEXT: s_or_b32 s1, s0, -4 -; GFX11-NEXT: s_cmp_lt_u32 s0, 4 -; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_movk_i32 s0, 0xa50 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa5 ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0 @@ -1759,14 +1583,7 @@ ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b32 s34, 3, 0 -; GFX678-NEXT: s_or_b32 s35, s34, -4 -; GFX678-NEXT: s_cmp_lt_u32 s34, 4 -; GFX678-NEXT: s_cselect_b32 s34, s34, s35 -; GFX678-NEXT: s_lshl_b32 s36, s34, 2 -; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_cselect_b32 s34, 10, 0xa50f ; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; 
GFX678-NEXT: s_setpc_b64 s[30:31] ; @@ -1774,14 +1591,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s34, 3, 0 -; GFX9-NEXT: s_or_b32 s35, s34, -4 -; GFX9-NEXT: s_cmp_lt_u32 s34, 4 -; GFX9-NEXT: s_cselect_b32 s34, s34, s35 -; GFX9-NEXT: s_lshl_b32 s36, s34, 2 -; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_cselect_b32 s34, 10, 0xa50f ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1790,14 +1600,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, 3, 0 -; GFX10-NEXT: s_or_b32 s35, s34, -4 -; GFX10-NEXT: s_cmp_lt_u32 s34, 4 -; GFX10-NEXT: s_cselect_b32 s34, s34, s35 -; GFX10-NEXT: s_lshl_b32 s36, s34, 2 -; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f -; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 -; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_cselect_b32 s34, 10, 0xa50f ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,14 +1609,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, 3, 0 -; GFX11-NEXT: s_or_b32 s1, s0, -4 -; GFX11-NEXT: s_cmp_lt_u32 s0, 4 -; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s2, s0, 2 -; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f -; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_cselect_b32 s0, 10, 0xa50f ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0