Index: llvm/docs/AMDGPUUsage.rst =================================================================== --- llvm/docs/AMDGPUUsage.rst +++ llvm/docs/AMDGPUUsage.rst @@ -1020,6 +1020,12 @@ register do not exactly match the FLT_ROUNDS values, so a conversion is performed. + :ref:`llvm.set.rounding` Input value expected to be one of the valid results + from '``llvm.get.rounding``'. Rounding mode is + undefined if not passed a valid input. This should be + a wave uniform value. In case of a divergent input + value, the first active lane's value will be used. + llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values provided by each lane in the wavefront. Intrinsic takes a hint for reduction strategy using second operand Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -25513,6 +25513,8 @@ Other values may be used to represent additional rounding modes, supported by a target. These values are target-specific. +.. _int_set_rounding: + '``llvm.set.rounding``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/docs/ReleaseNotes.rst =================================================================== --- llvm/docs/ReleaseNotes.rst +++ llvm/docs/ReleaseNotes.rst @@ -81,7 +81,7 @@ * Implemented `llvm.stacksave` and `llvm.stackrestore` intrinsics. 
-* Implemented :ref:`llvm.get.rounding <int_get_rounding>` +* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>` Changes to the ARM Backend -------------------------- Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -412,6 +412,7 @@ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -756,6 +756,7 @@ setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); + setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); // TODO: Could move this to custom lowering, could benefit from combines on // extract of relevant bits. @@ -3617,6 +3618,75 @@ return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); } +SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + + SDValue NewMode = Op.getOperand(1); + assert(NewMode.getValueType() == MVT::i32); + + // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the + // hardware MODE.fp_round values. 
+ if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) { + uint32_t ClampedVal = std::min( + static_cast<uint32_t>(ConstMode->getZExtValue()), + static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64)); + NewMode = DAG.getConstant( + AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32); + } else { + SDValue BitTable = + DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64); + + // The supported standard values are 0-3. The extended values start at 8. We + // need to offset by 4 if the value is in the extended range. + + // is_standard = value < 4; + // table_index = is_standard ? value : (value - 4) + // MODE.fp_round = (bit_table >> table_index) & 0xf + + SDValue Four = DAG.getConstant(4, SL, MVT::i32); + SDValue IsStandardValue = + DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT); + SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four); + SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, + NewMode, OffsetEnum); + + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two); + + SDValue TableValue = + DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); + SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + + // No need to mask out the high bits since the setreg will ignore them + // anyway. + NewMode = TruncTable; + + // Insert a readfirstlane in case the value is a VGPR. We could do this + // earlier and keep more operations scalar, but that interferes with + // combining the source. + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); + NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, + ReadFirstLaneID, NewMode); + } + + // N.B. The setreg will be later folded into s_round_mode on supported + // targets. 
+ SDValue IntrinID = + DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); + uint32_t BothRoundHwReg = + AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); + + SDValue SetReg = + DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0), + IntrinID, RoundBothImm, NewMode); + + return SetReg; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -5128,6 +5198,8 @@ return LowerSTACKSAVE(Op, DAG); case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG); + case ISD::SET_ROUNDING: + return lowerSET_ROUNDING(Op, DAG); } return SDValue(); } Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h @@ -142,6 +142,18 @@ // values. extern const uint64_t FltRoundConversionTable; +// Bit indexed table to convert from FLT_ROUNDS values to hardware rounding mode +// values +extern const uint64_t FltRoundToHWConversionTable; + +/// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value. 
+constexpr uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds) { + uint32_t IndexVal = FltRounds; + if (IndexVal > TowardNegative) + IndexVal -= ExtendedFltRoundOffset; + return (FltRoundToHWConversionTable >> (IndexVal << 2)) & 0xf; +} + } // end namespace AMDGPU } // end namespace llvm Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -168,3 +168,116 @@ static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( HWTowardNegative, HWTowardPositive)) == TowardNegativeF32_TowardPositiveF64); + +// Decode FLT_ROUNDS into the hardware value where the two rounding modes are +// the same and use a standard value +static constexpr uint64_t encodeFltRoundsToHWTableSame(uint32_t HWVal, + uint32_t FltRoundsVal) { + if (FltRoundsVal > TowardNegative) + FltRoundsVal -= ExtendedFltRoundOffset; + + return static_cast(getModeRegisterRoundMode(HWVal, HWVal)) + << (FltRoundsVal << 2); +} + +/// Decode FLT_ROUNDS into the hardware value where the two rounding modes +/// different and use an extended value. 
+static constexpr uint64_t encodeFltRoundsToHWTable(uint32_t HWF32Val, + uint32_t HWF64Val, + uint32_t FltRoundsVal) { + if (FltRoundsVal > TowardNegative) + FltRoundsVal -= ExtendedFltRoundOffset; + return static_cast(getModeRegisterRoundMode(HWF32Val, HWF64Val)) + << (FltRoundsVal << 2); +} + +constexpr uint64_t AMDGPU::FltRoundToHWConversionTable = + encodeFltRoundsToHWTableSame(HWTowardZero, TowardZeroF32_TowardZeroF64) | + encodeFltRoundsToHWTableSame(HWNearestTiesToEven, + NearestTiesToEvenF32_NearestTiesToEvenF64) | + encodeFltRoundsToHWTableSame(HWTowardPositive, + TowardPositiveF32_TowardPositiveF64) | + encodeFltRoundsToHWTableSame(HWTowardNegative, + TowardNegativeF32_TowardNegativeF64) | + + encodeFltRoundsToHWTable(HWTowardZero, HWNearestTiesToEven, + TowardZeroF32_NearestTiesToEvenF64) | + encodeFltRoundsToHWTable(HWTowardZero, HWTowardPositive, + TowardZeroF32_TowardPositiveF64) | + encodeFltRoundsToHWTable(HWTowardZero, HWTowardNegative, + TowardZeroF32_TowardNegativeF64) | + + encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardZero, + NearestTiesToEvenF32_TowardZeroF64) | + encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardPositive, + NearestTiesToEvenF32_TowardPositiveF64) | + encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardNegative, + NearestTiesToEvenF32_TowardNegativeF64) | + + encodeFltRoundsToHWTable(HWTowardPositive, HWTowardZero, + TowardPositiveF32_TowardZeroF64) | + encodeFltRoundsToHWTable(HWTowardPositive, HWNearestTiesToEven, + TowardPositiveF32_NearestTiesToEvenF64) | + encodeFltRoundsToHWTable(HWTowardPositive, HWTowardNegative, + TowardPositiveF32_TowardNegativeF64) | + + encodeFltRoundsToHWTable(HWTowardNegative, HWTowardZero, + TowardNegativeF32_TowardZeroF64) | + encodeFltRoundsToHWTable(HWTowardNegative, HWNearestTiesToEven, + TowardNegativeF32_NearestTiesToEvenF64) | + encodeFltRoundsToHWTable(HWTowardNegative, HWTowardPositive, + TowardNegativeF32_TowardPositiveF64); + +// Verify evaluation of 
FltRoundToHWConversionTable + +static_assert(decodeFltRoundToHWConversionTable(AMDGPUFltRounds::TowardZero) == + getModeRegisterRoundMode(HWTowardZero, HWTowardZero)); +static_assert( + decodeFltRoundToHWConversionTable(AMDGPUFltRounds::NearestTiesToEven) == + getModeRegisterRoundMode(HWNearestTiesToEven, HWNearestTiesToEven)); +static_assert( + decodeFltRoundToHWConversionTable(AMDGPUFltRounds::TowardPositive) == + getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive)); +static_assert( + decodeFltRoundToHWConversionTable(AMDGPUFltRounds::TowardNegative) == + getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative)); + +static_assert( + decodeFltRoundToHWConversionTable(NearestTiesToEvenF32_TowardPositiveF64) == + getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardPositive)); +static_assert( + decodeFltRoundToHWConversionTable(NearestTiesToEvenF32_TowardNegativeF64) == + getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardNegative)); +static_assert( + decodeFltRoundToHWConversionTable(NearestTiesToEvenF32_TowardZeroF64) == + getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardZero)); + +static_assert( + decodeFltRoundToHWConversionTable(TowardPositiveF32_NearestTiesToEvenF64) == + getModeRegisterRoundMode(HWTowardPositive, HWNearestTiesToEven)); +static_assert( + decodeFltRoundToHWConversionTable(TowardPositiveF32_TowardNegativeF64) == + getModeRegisterRoundMode(HWTowardPositive, HWTowardNegative)); +static_assert( + decodeFltRoundToHWConversionTable(TowardPositiveF32_TowardZeroF64) == + getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)); + +static_assert( + decodeFltRoundToHWConversionTable(TowardNegativeF32_NearestTiesToEvenF64) == + getModeRegisterRoundMode(HWTowardNegative, HWNearestTiesToEven)); +static_assert( + decodeFltRoundToHWConversionTable(TowardNegativeF32_TowardPositiveF64) == + getModeRegisterRoundMode(HWTowardNegative, HWTowardPositive)); +static_assert( + decodeFltRoundToHWConversionTable(TowardNegativeF32_TowardZeroF64) == 
+ getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)); + +static_assert( + decodeFltRoundToHWConversionTable(TowardZeroF32_NearestTiesToEvenF64) == + getModeRegisterRoundMode(HWTowardZero, HWNearestTiesToEven)); +static_assert( + decodeFltRoundToHWConversionTable(TowardZeroF32_TowardPositiveF64) == + getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)); +static_assert( + decodeFltRoundToHWConversionTable(TowardZeroF32_TowardNegativeF64) == + getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)); Index: llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -0,0 +1,1919 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s +; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s + +declare void @llvm.set.rounding(i32) +declare i32 @llvm.get.rounding() + +define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) { +; GFX678-LABEL: s_set_rounding: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_add_i32 s34, s4, -4 +; GFX678-NEXT: s_cmp_lt_u32 s4, 4 +; GFX678-NEXT: s_cselect_b32 s34, s4, s34 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; 
GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s34, s4, -4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 4 +; GFX9-NEXT: s_cselect_b32 s34, s4, s34 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s34, s4, -4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 4 +; GFX10-NEXT: s_cselect_b32 s34, s4, s34 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s4, -4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 4 +; GFX11-NEXT: s_cselect_b32 s0, s4, s0 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { +; GFX6-LABEL: s_set_rounding_kernel: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s3, s2, -4 +; GFX6-NEXT: s_cmp_lt_u32 s2, 4 +; GFX6-NEXT: s_cselect_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s2, 
s2, 2 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: s_set_rounding_kernel: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX7-NEXT: ;;#ASMSTART +; GFX7-NEXT: ;;#ASMEND +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s3, s2, -4 +; GFX7-NEXT: s_cmp_lt_u32 s2, 4 +; GFX7-NEXT: s_cselect_b32 s2, s2, s3 +; GFX7-NEXT: s_lshl_b32 s2, s2, 2 +; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: s_set_rounding_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s3, s2, -4 +; GFX8-NEXT: s_cmp_lt_u32 s2, 4 +; GFX8-NEXT: s_cselect_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, 2 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_set_rounding_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s3, s2, -4 +; GFX9-NEXT: s_cmp_lt_u32 s2, 4 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_set_rounding_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s1, s0, -4 +; GFX10-NEXT: s_cmp_lt_u32 s0, 4 +; GFX10-NEXT: 
s_cselect_b32 s2, s0, s1 +; GFX10-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX10-NEXT: s_lshl_b32 s2, s2, 2 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_set_rounding_kernel: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s2, s0, s1 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_endpgm + call void @llvm.set.rounding(i32 %rounding) + call void asm sideeffect "",""() + ret void +} + +define void @v_set_rounding(i32 %rounding) { +; GFX6-LABEL: v_set_rounding: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -4, v0 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_set_rounding: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v1, vcc, -4, v0 +; GFX7-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX7-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; 
GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_set_rounding: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -4, v0 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_set_rounding: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -4, v0 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_set_rounding: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, -4, v0 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_set_rounding: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, -4, v0 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define void @set_rounding_get_rounding() { +; GFX678-LABEL: set_rounding_get_rounding: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4) +; GFX678-NEXT: s_lshl_b32 s6, s4, 2 +; GFX678-NEXT: s_mov_b32 s4, 0xeb24da71 +; GFX678-NEXT: s_mov_b32 s5, 0xc96f385 +; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX678-NEXT: s_and_b32 s4, s4, 15 +; GFX678-NEXT: s_add_i32 s5, s4, 4 +; GFX678-NEXT: s_cmp_lt_u32 s4, 4 +; GFX678-NEXT: s_cselect_b32 s4, s4, s5 +; GFX678-NEXT: s_add_i32 s5, s4, -4 +; GFX678-NEXT: s_cmp_lt_u32 s4, 4 +; GFX678-NEXT: s_cselect_b32 s4, s4, s5 +; GFX678-NEXT: s_lshl_b32 s6, s4, 2 +; GFX678-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: set_rounding_get_rounding: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4) +; GFX9-NEXT: s_lshl_b32 s6, s4, 2 +; GFX9-NEXT: s_mov_b32 s4, 0xeb24da71 +; GFX9-NEXT: s_mov_b32 s5, 0xc96f385 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX9-NEXT: s_and_b32 s4, s4, 15 +; GFX9-NEXT: s_add_i32 s5, s4, 4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 4 +; GFX9-NEXT: s_cselect_b32 s4, s4, s5 +; GFX9-NEXT: s_add_i32 s5, s4, -4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 4 +; GFX9-NEXT: s_cselect_b32 s4, s4, s5 +; GFX9-NEXT: s_lshl_b32 s6, s4, 2 +; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-LABEL: set_rounding_get_rounding: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_getreg_b32 s6, hwreg(HW_REG_MODE, 0, 4) +; GFX10-NEXT: s_mov_b32 s4, 0xeb24da71 +; GFX10-NEXT: s_mov_b32 s5, 0xc96f385 +; GFX10-NEXT: s_lshl_b32 s6, s6, 2 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX10-NEXT: s_and_b32 s4, s4, 15 +; GFX10-NEXT: s_add_i32 s5, s4, 4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 4 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_add_i32 s5, s4, -4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 4 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s6, s4, 2 +; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: set_rounding_get_rounding: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 4) +; GFX11-NEXT: s_mov_b32 s0, 0xeb24da71 +; GFX11-NEXT: s_mov_b32 s1, 0xc96f385 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: s_add_i32 s1, s0, 4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %rounding = call i32 @llvm.get.rounding() + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define void @s_set_rounding_0() { +; GFX678-LABEL: s_set_rounding_0: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 0, 4), 15 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_0: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xf +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 0) + ret void +} + +define void @s_set_rounding_1() { +; GFX678-LABEL: s_set_rounding_1: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_1: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 1) + ret void +} + +define void @s_set_rounding_2() { +; GFX678-LABEL: s_set_rounding_2: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_2: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x5 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 2) + ret void +} + +define void @s_set_rounding_3() { +; GFX678-LABEL: s_set_rounding_3: +; GFX678: ; %bb.0: +; 
GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_3: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xa +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 3) + ret void +} + +; Unsupported mode. +define void @s_set_rounding_4() { +; GFX678-LABEL: s_set_rounding_4: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_4: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xf +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 4) + ret void +} + +; undefined +define void @s_set_rounding_5() { +; GFX678-LABEL: s_set_rounding_5: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_5: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void 
@llvm.set.rounding(i32 5) + ret void +} + +; undefined +define void @s_set_rounding_6() { +; GFX678-LABEL: s_set_rounding_6: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_6: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x5 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 6) + ret void +} + +; "Dynamic" +define void @s_set_rounding_7() { +; GFX678-LABEL: s_set_rounding_7: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_7: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xa +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 7) + ret void +} + +; Invalid +define void @s_set_rounding_neg1() { +; GFX678-LABEL: s_set_rounding_neg1: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_neg1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_neg1: +; GFX1011: ; %bb.0: +; 
GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xb +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 -1) + ret void +} + +; -------------------------------------------------------------------- +; Test extended values +; -------------------------------------------------------------------- + +; NearestTiesToEvenF32_TowardPositiveF64 = 8 +define void @s_set_rounding_8() { +; GFX678-LABEL: s_set_rounding_8: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_8: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x4 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 8) + ret void +} + +; NearestTiesToEvenF32_TowardNegativeF64 = 9 +define void @s_set_rounding_9() { +; GFX678-LABEL: s_set_rounding_9: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_9: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x8 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 9) + ret void +} + +; NearestTiesToEvenF32_TowardZeroF64 = 10 +define void @s_set_rounding_10() { +; GFX678-LABEL: s_set_rounding_10: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_10: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xc +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 10) + ret void +} + +; TowardPositiveF32_NearestTiesToEvenF64 = 11 +define void @s_set_rounding_11() { +; GFX678-LABEL: s_set_rounding_11: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_11: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x1 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 11) + ret void +} + +; TowardPositiveF32_TowardNegativeF64 = 12 +define void @s_set_rounding_12() { +; GFX678-LABEL: s_set_rounding_12: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_12: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x9 +; GFX1011-NEXT: s_setpc_b64 
s[30:31] + call void @llvm.set.rounding(i32 12) + ret void +} + +; TowardPositiveF32_TowardZeroF64 = 13 +define void @s_set_rounding_13() { +; GFX678-LABEL: s_set_rounding_13: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_13: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xd +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 13) + ret void +} + +; TowardNegativeF32_NearestTiesToEvenF64 = 14 +define void @s_set_rounding_14() { +; GFX678-LABEL: s_set_rounding_14: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_14: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x2 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 14) + ret void +} + +; TowardNegativeF32_TowardPositiveF64 = 15 +define void @s_set_rounding_15() { +; GFX678-LABEL: s_set_rounding_15: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 0, 4), 6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_15: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x6 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 15) + ret void +} + + +; TowardNegativeF32_TowardZeroF64 = 16 +define void @s_set_rounding_16() { +; GFX678-LABEL: s_set_rounding_16: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_16: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xe +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 16) + ret void +} + +; TowardZeroF32_NearestTiesToEvenF64 = 17 +define void @s_set_rounding_17() { +; GFX678-LABEL: s_set_rounding_17: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_17: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_17: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x3 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 17) + ret void +} + +; TowardZeroF32_TowardPositiveF64 = 18 +define void @s_set_rounding_18() { +; GFX678-LABEL: s_set_rounding_18: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_18: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_18: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0x7 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 18) + ret void +} + +; TowardZeroF32_TowardNegativeF64 = 19, +define void @s_set_rounding_19() { +; GFX678-LABEL: s_set_rounding_19: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_19: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_19: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xb +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 19) + ret void +} + +; Invalid, out of bounds +define void @s_set_rounding_20() { +; GFX678-LABEL: s_set_rounding_20: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_20: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_20: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xb +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 20) + ret void +} 
+ +define void @s_set_rounding_0xffff() { +; GFX678-LABEL: s_set_rounding_0xffff: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_0xffff: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: s_set_rounding_0xffff: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: s_round_mode 0xb +; GFX1011-NEXT: s_setpc_b64 s[30:31] + call void @llvm.set.rounding(i32 65535) + ret void +} + +; -------------------------------------------------------------------- +; Test optimization knowing the value can only be in the standard +; range +; -------------------------------------------------------------------- + +define amdgpu_gfx void @s_set_rounding_i2_zeroext(i2 zeroext inreg %rounding) { +; GFX6-LABEL: s_set_rounding_i2_zeroext: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_or_b32 s34, s4, -4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 4 +; GFX6-NEXT: s_cselect_b32 s34, s4, s34 +; GFX6-NEXT: s_lshl_b32 s36, s34, 2 +; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_i2_zeroext: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_or_b32 s34, s4, -4 +; GFX7-NEXT: s_cmp_lt_u32 s4, 4 +; GFX7-NEXT: s_cselect_b32 s34, s4, s34 +; GFX7-NEXT: s_lshl_b32 s36, s34, 2 +; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: s_set_rounding_i2_zeroext: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s36, 0xffff, s4 +; GFX8-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 +; GFX8-NEXT: s_or_b32 s37, s36, -4 +; GFX8-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GFX8-NEXT: s_cselect_b32 s34, s36, s37 +; GFX8-NEXT: s_lshl_b32 s36, s34, 2 +; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_i2_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s36, 0xffff, s4 +; GFX9-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 +; GFX9-NEXT: s_or_b32 s37, s36, -4 +; GFX9-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GFX9-NEXT: s_cselect_b32 s34, s36, s37 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_i2_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lt_u16_e64 s34, s4, 4 +; GFX10-NEXT: s_and_b32 s35, 0xffff, s4 +; GFX10-NEXT: s_or_b32 s36, s35, -4 +; GFX10-NEXT: s_and_b32 s34, s34, exec_lo +; GFX10-NEXT: s_cselect_b32 s34, s35, s36 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_i2_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_lt_u16_e64 s0, s4, 4 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX11-NEXT: s_or_b32 s2, s1, -4 
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %zext.rounding = zext i2 %rounding to i32 + call void @llvm.set.rounding(i32 %zext.rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) { +; GFX6-LABEL: s_set_rounding_i2_signext: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s34, s4, -4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 4 +; GFX6-NEXT: s_cselect_b32 s34, s4, s34 +; GFX6-NEXT: s_lshl_b32 s36, s34, 2 +; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_i2_signext: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s34, s4, -4 +; GFX7-NEXT: s_cmp_lt_u32 s4, 4 +; GFX7-NEXT: s_cselect_b32 s34, s4, s34 +; GFX7-NEXT: s_lshl_b32 s36, s34, 2 +; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_set_rounding_i2_signext: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sext_i32_i16 s34, s4 +; GFX8-NEXT: s_add_i32 s35, s34, -4 +; GFX8-NEXT: s_cmp_lt_u32 s34, 4 +; GFX8-NEXT: s_cselect_b32 s34, s34, s35 +; GFX8-NEXT: s_lshl_b32 s36, s34, 2 +; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_i2_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sext_i32_i16 s34, s4 +; GFX9-NEXT: s_add_i32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_i2_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i16 s34, s4 +; GFX10-NEXT: s_add_i32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_i2_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s4 +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sext.rounding = sext i2 %rounding to i32 + call void @llvm.set.rounding(i32 %sext.rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) { +; GFX6-LABEL: s_set_rounding_i3_signext: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s34, s4, -4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 4 +; GFX6-NEXT: 
s_cselect_b32 s34, s4, s34 +; GFX6-NEXT: s_lshl_b32 s36, s34, 2 +; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_i3_signext: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s34, s4, -4 +; GFX7-NEXT: s_cmp_lt_u32 s4, 4 +; GFX7-NEXT: s_cselect_b32 s34, s4, s34 +; GFX7-NEXT: s_lshl_b32 s36, s34, 2 +; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_set_rounding_i3_signext: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sext_i32_i16 s34, s4 +; GFX8-NEXT: s_add_i32 s35, s34, -4 +; GFX8-NEXT: s_cmp_lt_u32 s34, 4 +; GFX8-NEXT: s_cselect_b32 s34, s34, s35 +; GFX8-NEXT: s_lshl_b32 s36, s34, 2 +; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_i3_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sext_i32_i16 s34, s4 +; GFX9-NEXT: s_add_i32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_i3_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i16 s34, s4 +; GFX10-NEXT: 
s_add_i32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_i3_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s4 +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sext.rounding = sext i3 %rounding to i32 + call void @llvm.set.rounding(i32 %sext.rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) { +; GFX6-LABEL: s_set_rounding_i3_zeroext: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s34, s4, -4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 4 +; GFX6-NEXT: s_cselect_b32 s34, s4, s34 +; GFX6-NEXT: s_lshl_b32 s36, s34, 2 +; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_i3_zeroext: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s34, s4, -4 +; GFX7-NEXT: s_cmp_lt_u32 s4, 4 +; GFX7-NEXT: s_cselect_b32 s34, s4, s34 +; GFX7-NEXT: s_lshl_b32 s36, s34, 2 +; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 
0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_set_rounding_i3_zeroext: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s36, 0xffff, s4 +; GFX8-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 +; GFX8-NEXT: s_add_i32 s37, s36, -4 +; GFX8-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GFX8-NEXT: s_cselect_b32 s34, s36, s37 +; GFX8-NEXT: s_lshl_b32 s36, s34, 2 +; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_i3_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s36, 0xffff, s4 +; GFX9-NEXT: v_cmp_lt_u16_e64 s[34:35], s4, 4 +; GFX9-NEXT: s_add_i32 s37, s36, -4 +; GFX9-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GFX9-NEXT: s_cselect_b32 s34, s36, s37 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_i3_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lt_u16_e64 s34, s4, 4 +; GFX10-NEXT: s_and_b32 s35, 0xffff, s4 +; GFX10-NEXT: s_add_i32 s36, s35, -4 +; GFX10-NEXT: s_and_b32 s34, s34, exec_lo +; GFX10-NEXT: s_cselect_b32 s34, s35, s36 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_i3_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_lt_u16_e64 s0, s4, 4 +; GFX11-NEXT: 
s_and_b32 s1, 0xffff, s4 +; GFX11-NEXT: s_add_i32 s2, s1, -4 +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sext.rounding = zext i3 %rounding to i32 + call void @llvm.set.rounding(i32 %sext.rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) { +; GFX6-LABEL: s_set_rounding_select_0_1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0 +; GFX6-NEXT: v_readfirstlane_b32 s34, v0 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: s_set_rounding_select_0_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 +; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0 +; GFX7-NEXT: v_readfirstlane_b32 s34, v0 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_set_rounding_select_0_1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 +; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX8-NEXT: v_readfirstlane_b32 s34, v0 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX9-NEXT: v_readfirstlane_b32 s34, v0 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_0_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35] +; GFX10-NEXT: v_readfirstlane_b32 s34, v0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_0_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 0, i32 1 + call void @llvm.set.rounding(i32 %rounding) + ret void 
+} + +define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_1_3: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b32 s34, 1, 3 +; GFX678-NEXT: s_or_b32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s34, 1, 3 +; GFX9-NEXT: s_or_b32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_1_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, 1, 3 +; GFX10-NEXT: s_or_b32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_1_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 3 +; GFX11-NEXT: 
s_or_b32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 1, i32 3 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define void @v_set_rounding_select_1_3(i32 %cond) { +; GFX6-LABEL: v_set_rounding_select_1_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc +; GFX6-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_set_rounding_select_1_3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc +; GFX7-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX7-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX7-NEXT: v_lshr_b64 v[0:1], s[4:5], v0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_set_rounding_select_1_3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc +; 
GFX8-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_set_rounding_select_1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc +; GFX9-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_set_rounding_select_1_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 3, 1, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5] +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_set_rounding_select_1_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 
3, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v1, -4, v0 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 1, i32 3 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_2_0: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX678-NEXT: v_readfirstlane_b32 s34, v0 +; GFX678-NEXT: s_lshl_b32 s34, s34, 1 +; GFX678-NEXT: s_or_b32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX9-NEXT: v_readfirstlane_b32 s34, v0 +; GFX9-NEXT: s_lshl_b32 s34, s34, 1 +; GFX9-NEXT: s_or_b32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX10-LABEL: s_set_rounding_select_2_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 +; GFX10-NEXT: v_readfirstlane_b32 s34, v0 +; GFX10-NEXT: s_lshl_b32 s34, s34, 1 +; GFX10-NEXT: s_or_b32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_2_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_or_b32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 2, i32 0 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_2_1: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b32 s34, 2, 1 +; GFX678-NEXT: s_or_b32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: 
s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s34, 2, 1 +; GFX9-NEXT: s_or_b32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_2_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, 2, 1 +; GFX10-NEXT: s_or_b32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_2_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, 2, 1 +; GFX11-NEXT: s_or_b32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 2, i32 1 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + 
+define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_1_2: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b32 s34, 1, 2 +; GFX678-NEXT: s_or_b32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s34, 1, 2 +; GFX9-NEXT: s_or_b32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_1_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, 1, 2 +; GFX10-NEXT: s_or_b32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_1_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 2 +; GFX11-NEXT: s_or_b32 
s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 1, i32 2 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_3_0: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b32 s34, 3, 0 +; GFX678-NEXT: s_or_b32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s34, 3, 0 +; GFX9-NEXT: s_or_b32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_3_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, 3, 0 +; GFX10-NEXT: s_or_b32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; 
GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_3_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, 3, 0 +; GFX11-NEXT: s_or_b32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 3, i32 0 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_4_0: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX678-NEXT: v_readfirstlane_b32 s34, v0 +; GFX678-NEXT: s_lshl_b32 s34, s34, 2 +; GFX678-NEXT: s_add_i32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_4_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; 
GFX9-NEXT: v_readfirstlane_b32 s34, v0 +; GFX9-NEXT: s_lshl_b32 s34, s34, 2 +; GFX9-NEXT: s_add_i32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_4_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 +; GFX10-NEXT: v_readfirstlane_b32 s34, v0 +; GFX10-NEXT: s_lshl_b32 s34, s34, 2 +; GFX10-NEXT: s_add_i32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_4_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 4, i32 0 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +define amdgpu_gfx void 
@s_set_rounding_select_3_5(i32 inreg %cond) { +; GFX678-LABEL: s_set_rounding_select_3_5: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_cselect_b32 s34, 3, 5 +; GFX678-NEXT: s_add_i32 s35, s34, -4 +; GFX678-NEXT: s_cmp_lt_u32 s34, 4 +; GFX678-NEXT: s_cselect_b32 s34, s34, s35 +; GFX678-NEXT: s_lshl_b32 s36, s34, 2 +; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_set_rounding_select_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s34, 3, 5 +; GFX9-NEXT: s_add_i32 s35, s34, -4 +; GFX9-NEXT: s_cmp_lt_u32 s34, 4 +; GFX9-NEXT: s_cselect_b32 s34, s34, s35 +; GFX9-NEXT: s_lshl_b32 s36, s34, 2 +; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_set_rounding_select_3_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s34, 3, 5 +; GFX10-NEXT: s_add_i32 s35, s34, -4 +; GFX10-NEXT: s_cmp_lt_u32 s34, 4 +; GFX10-NEXT: s_cselect_b32 s34, s34, s35 +; GFX10-NEXT: s_lshl_b32 s36, s34, 2 +; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f +; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9 +; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_set_rounding_select_3_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s0, 3, 5 +; GFX11-NEXT: s_add_i32 s1, s0, -4 +; 
GFX11-NEXT: s_cmp_lt_u32 s0, 4 +; GFX11-NEXT: s_cselect_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f +; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %cond, 0 + %rounding = select i1 %cmp, i32 3, i32 5 + call void @llvm.set.rounding(i32 %rounding) + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}}