Index: llvm/docs/AMDGPUUsage.rst =================================================================== --- llvm/docs/AMDGPUUsage.rst +++ llvm/docs/AMDGPUUsage.rst @@ -948,42 +948,61 @@ .. table:: AMDGPU LLVM IR Intrinsics :name: amdgpu-llvm-ir-intrinsics-table - ========================================= ========================================================== - LLVM Intrinsic Description - ========================================= ========================================================== - llvm.amdgcn.log Provides direct access to v_log_f32 and v_log_f16 - (on targets with half support). Peforms log2 function. - - llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16 - (on targets with half support). Performs exp2 function. - - :ref:`llvm.get.fpenv.i32 ` The natural floating-point environment type is i32. This - implemented by extracting relevant bits out of the MODE - register with s_getreg_b32. The first 10 bits are the - core floating-point mode. Bits 12:18 are the exception - mask. On gfx9+, bit 23 is FP16_OVFL. Bitfields not - relevant to floating-point instructions are 0s. - - :ref:`llvm.log2 ` Implemented for float and half (and vectors of float or - half). Not implemented for double. Hardware provides - 1ULP accuracy for float, and 0.51ULP for half. Float - instruction does not natively support denormal - inputs. Backend will optimize out denormal scaling if - marked with the :ref:`afn ` flag. - - :ref:`llvm.exp2 ` Implemented for float and half (and vectors of float or - half). Not implemented for double. Hardware provides - 1ULP accuracy for float, and 0.51ULP for half. Float - instruction does not natively support denormal - inputs. Backend will optimize out denormal scaling if - marked with the :ref:`afn ` flag. - - :ref:`llvm.log ` Implemented for float and half (and vectors). - - :ref:`llvm.exp ` Implemented for float and half (and vectors). - - :ref:`llvm.log10 ` Implemented for float and half (and vectors). 
- ========================================= ========================================================== + ========================================== ========================================================== + LLVM Intrinsic Description + ========================================== ========================================================== + llvm.amdgcn.log Provides direct access to v_log_f32 and v_log_f16 + (on targets with half support). Performs log2 function. + + llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16 + (on targets with half support). Performs exp2 function. + + :ref:`llvm.get.fpenv.i32 <int_get_fpenv>` The natural floating-point environment type is i32. This + is implemented by extracting relevant bits out of the MODE + register with s_getreg_b32. The first 10 bits are the + core floating-point mode. Bits 12:18 are the exception + mask. On gfx9+, bit 23 is FP16_OVFL. Bitfields not + relevant to floating-point instructions are 0s. + + :ref:`llvm.log2 <int_log2>` Implemented for float and half (and vectors of float or + half). Not implemented for double. Hardware provides + 1ULP accuracy for float, and 0.51ULP for half. Float + instruction does not natively support denormal + inputs. Backend will optimize out denormal scaling if + marked with the :ref:`afn <fastmath_afn>` flag. + + :ref:`llvm.exp2 <int_exp2>` Implemented for float and half (and vectors of float or + half). Not implemented for double. Hardware provides + 1ULP accuracy for float, and 0.51ULP for half. Float + instruction does not natively support denormal + inputs. Backend will optimize out denormal scaling if + marked with the :ref:`afn <fastmath_afn>` flag. + + :ref:`llvm.log <int_log>` Implemented for float and half (and vectors). + + :ref:`llvm.exp <int_exp>` Implemented for float and half (and vectors). + + :ref:`llvm.log10 <int_log10>` Implemented for float and half (and vectors). + + + :ref:`llvm.get.rounding <int_get_rounding>` AMDGPU supports two separately controllable rounding + modes depending on the floating-point type.
One + controls float, and the other controls both double and + half operations. If both modes are the same, returns + one of the standard return values. If the modes are + different, returns one of + :ref:`12 extended values <amdgpu-rounding-mode-enumeration-values-table>` + describing the two modes. The extended value is + composed of a pair of standard 0-3 values shifted up. + The float mode is described by bits [3:4] and the + double/half mode by bits [5:6]. + + To nearest, ties away from zero is not a supported + mode. The raw rounding mode values in the MODE + register do not exactly match the FLT_ROUNDS values, + so a conversion is performed. + + ========================================== ========================================================== .. TODO:: @@ -4711,6 +4730,22 @@ FLOAT_ROUND_MODE_ZERO 3 Round Toward 0 ====================================== ===== ============================== + + .. table:: Extended FLT_ROUNDS Enumeration Values + :name: amdgpu-rounding-mode-enumeration-values-table + + +------------------------+---------------+-------------------+--------------------+----------+ + | | F32 NEAR_EVEN | F32 PLUS_INFINITY | F32 MINUS_INFINITY | F32 ZERO | + +------------------------+---------------+-------------------+--------------------+----------+ + | F64/F16 NEAR_EVEN | 1 | 48 | 56 | 32 | + +------------------------+---------------+-------------------+--------------------+----------+ + | F64/F16 PLUS_INFINITY | 72 | 2 | 88 | 64 | + +------------------------+---------------+-------------------+--------------------+----------+ + | F64/F16 MINUS_INFINITY | 104 | 112 | 3 | 96 | + +------------------------+---------------+-------------------+--------------------+----------+ + | F64/F16 ZERO | 8 | 16 | 24 | 0 | + +------------------------+---------------+-------------------+--------------------+----------+ + .. ..
table:: Floating Point Denorm Mode Enumeration Values Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -25339,6 +25339,8 @@ mode or state of floating point exceptions. Altering the floating point environment requires special care. See :ref:`Floating Point Environment <floatenv>`. +.. _int_get_rounding: + '``llvm.get.rounding``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/docs/ReleaseNotes.rst =================================================================== --- llvm/docs/ReleaseNotes.rst +++ llvm/docs/ReleaseNotes.rst @@ -146,6 +146,8 @@ * llvm.exp2.f32 and llvm.exp.f32 are now lowered accurately. Use llvm.amdgcn.exp2.f32 to access the old behavior for llvm.exp2.f32. +* Implemented :ref:`llvm.get.rounding <int_get_rounding>` + Changes to the ARM Backend -------------------------- Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -872,6 +872,7 @@ /// 2 Round to +inf /// 3 Round to -inf /// 4 Round to nearest, ties to zero + /// Other values are target dependent. /// Result is rounding mode and chain. Input is a chain.
GET_ROUNDING, Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -401,6 +401,7 @@ SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -749,6 +749,7 @@ // FP environment is a subset of a 32-bit mode register. setOperationAction(ISD::GET_FPENV, MVT::i32, Legal); + setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, @@ -3534,6 +3535,65 @@ return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); } +SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(Op.getValueType() == MVT::i32); + + uint32_t BothRoundHwReg = + AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); + + SDValue IntrinID = + DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); + SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), + Op.getOperand(0), IntrinID, GetRoundBothImm); + + // There are two rounding modes, one for f32 and one for f64/f16. We only + // report in the standard value range if both are the same. + // + // The raw values also differ from the expected FLT_ROUNDS values. Nearest + // ties away from zero is not supported, and the other values are rotated by + // 1. 
+ // + // If the two rounding modes are not the same, report the standard values + // shifted up beyond the standard return value range. + + // Mode register rounding mode fields: + // + // [1:0] Single-precision round mode. + // [3:2] Double/Half-precision round mode. + // + // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero. + // + // Hardware Spec + // Toward-0 3 0 + // Nearest Even 0 1 + // +Inf 1 2 + // -Inf 2 3 + // NearestAway0 N/A 4 + // + // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit + // table we can index by the raw hardware mode. + SDValue BitTable = + DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); + + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); + + // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we + // knew only one mode was demanded. + SDValue TableValue = + DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); + SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + + SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); + SDValue Result = DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); + + return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -4918,6 +4978,8 @@ return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::GET_ROUNDING: + return lowerGET_ROUNDING(Op, DAG); } return SDValue(); } Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h @@ -85,6 +85,81 @@ } }; + +namespace AMDGPU { + +/// Bit offset of nonstandard values for
llvm.get.rounding results. +static constexpr uint32_t ExtendedFltRoundShift = 3; + +/// Offset in mode register of f32 rounding mode. +static constexpr uint32_t F32FltRoundOffset = 0; + +/// Offset in mode register of f64/f16 rounding mode. +static constexpr uint32_t F64FltRoundOffset = 2; + +#define PACK_ROUND_MODE(f32mode, f64mode) \ + ((f32mode) << F32FltRoundOffset | (f64mode) << F64FltRoundOffset) \ + << ExtendedFltRoundShift + +/// Return values used for llvm.get.rounding +/// +/// When both the F32 and F64/F16 modes are the same, returns the standard +/// values. If they differ, returns a bitmask of the two modes, offset by +/// ExtendedFltRoundShift. +enum AMDGPUFltRounds : uint64_t { + // Inherit everything from RoundingMode. + TowardZero = static_cast<uint64_t>(RoundingMode::TowardZero), + NearestTiesToEven = static_cast<uint64_t>(RoundingMode::NearestTiesToEven), + TowardPositive = static_cast<uint64_t>(RoundingMode::TowardPositive), + TowardNegative = static_cast<uint64_t>(RoundingMode::TowardNegative), + NearestTiesToAwayUnsupported = + static_cast<uint64_t>(RoundingMode::NearestTiesToAway), + + Dynamic = static_cast<uint64_t>(RoundingMode::Dynamic), + + // If the modes are the same, use the standard values. + + // Permute the mismatched rounding mode cases into values that match the + // standard modes when shifted down and bit extracted.
+ + NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven, + NearestTiesToEvenF32_TowardPositiveF64 = + PACK_ROUND_MODE(NearestTiesToEven, TowardPositive), + NearestTiesToEvenF32_TowardNegativeF64 = + PACK_ROUND_MODE(NearestTiesToEven, TowardNegative), + NearestTiesToEvenF32_TowardZeroF64 = + PACK_ROUND_MODE(NearestTiesToEven, TowardZero), + + TowardPositiveF32_NearestTiesToEvenF64 = + PACK_ROUND_MODE(TowardPositive, NearestTiesToEven), + TowardPositiveF32_TowardPositiveF64 = TowardPositive, + TowardPositiveF32_TowardNegativeF64 = + PACK_ROUND_MODE(TowardPositive, TowardNegative), + TowardPositiveF32_TowardZeroF64 = PACK_ROUND_MODE(TowardPositive, TowardZero), + + TowardNegativeF32_NearestTiesToEvenF64 = + PACK_ROUND_MODE(TowardNegative, NearestTiesToEven), + TowardNegativeF32_TowardPositiveF64 = + PACK_ROUND_MODE(TowardNegative, TowardPositive), + TowardNegativeF32_TowardNegativeF64 = TowardNegative, + TowardNegativeF32_TowardZeroF64 = PACK_ROUND_MODE(TowardNegative, TowardZero), + + TowardZeroF32_NearestTiesToEvenF64 = + PACK_ROUND_MODE(TowardZero, NearestTiesToEven), + TowardZeroF32_TowardPositiveF64 = PACK_ROUND_MODE(TowardZero, TowardPositive), + TowardZeroF32_TowardNegativeF64 = PACK_ROUND_MODE(TowardZero, TowardNegative), + TowardZeroF32_TowardZeroF64 = TowardZero, + + Invalid = static_cast<uint64_t>(static_cast<uint8_t>(RoundingMode::Invalid)) +}; + +#undef PACK_ROUND_MODE + +// Bit-indexed table to convert from hardware rounding mode values to FLT_ROUNDS +// values.
+extern const uint64_t FltRoundConversionTable; + +} // end namespace AMDGPU + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -36,3 +36,131 @@ FP64FP16Denormals = DenormMode; } } + +using namespace AMDGPU; + +/// Combine f32 and f64 rounding modes into a combined rounding mode value. +static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val, + uint32_t HWFP64Val) { + return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset; +} + +// Encode FLT_ROUNDS value where the two rounding modes are the same and use a +// standard value. +static constexpr uint64_t encodeFltRoundsTableSame(uint32_t FltRoundsVal, + uint32_t HWVal) { + return static_cast<uint64_t>(FltRoundsVal) + << (getModeRegisterRoundMode(HWVal, HWVal) << 2); +} + +/// Encode FLT_ROUNDS value where the two rounding modes are different and use +/// an extended value.
+static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal, + uint32_t HWF32Val, + uint32_t HWF64Val) { + return static_cast<uint64_t>(FltRoundsVal >> ExtendedFltRoundShift) + << (getModeRegisterRoundMode(HWF32Val, HWF64Val) << 2); +} + +static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO; +static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST; +static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF; +static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF; + +const uint64_t AMDGPU::FltRoundConversionTable = + encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) | + encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64, + HWNearestTiesToEven) | + encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64, + HWTowardPositive) | + encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64, + HWTowardNegative) | + + encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero, + HWNearestTiesToEven) | + encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero, + HWTowardPositive) | + encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero, + HWTowardNegative) | + + encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64, + HWNearestTiesToEven, HWTowardZero) | + encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64, + HWNearestTiesToEven, HWTowardPositive) | + encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64, + HWNearestTiesToEven, HWTowardNegative) | + + encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive, + HWTowardZero) | + encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64, + HWTowardPositive, HWNearestTiesToEven) | + encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive, + HWTowardNegative) | + + encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative, + HWTowardZero) | + encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64, + HWTowardNegative,
HWNearestTiesToEven) | + encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative, + HWTowardPositive); + +// Convert mode register encoded rounding mode to AMDGPUFltRounds. +static constexpr AMDGPUFltRounds indexFltRoundConversionTable(uint32_t HWMode) { + return static_cast<AMDGPUFltRounds>(FltRoundConversionTable >> 4 * HWMode & + 0xf); +} + +// Verify evaluation of FltRoundConversionTable. + +// If both modes are the same, should return the standard values. +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWNearestTiesToEven)) == + AMDGPUFltRounds::NearestTiesToEven); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWTowardPositive)) == + AMDGPUFltRounds::TowardPositive); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWTowardNegative)) == + AMDGPUFltRounds::TowardNegative); + +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardZero, HWNearestTiesToEven)) == + TowardZeroF32_NearestTiesToEvenF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) == + TowardZeroF32_TowardPositiveF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) == + TowardZeroF32_TowardNegativeF64 >> ExtendedFltRoundShift); + +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWTowardZero)) == + NearestTiesToEvenF32_TowardZeroF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWTowardPositive)) == + NearestTiesToEvenF32_TowardPositiveF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( +
HWNearestTiesToEven, HWTowardNegative)) == + NearestTiesToEvenF32_TowardNegativeF64 >> ExtendedFltRoundShift); + +static_assert(indexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) == + TowardPositiveF32_TowardZeroF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWNearestTiesToEven)) == + TowardPositiveF32_NearestTiesToEvenF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWTowardNegative)) == + TowardPositiveF32_TowardNegativeF64 >> ExtendedFltRoundShift); + +static_assert(indexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) == + TowardNegativeF32_TowardZeroF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWNearestTiesToEven)) == + TowardNegativeF32_NearestTiesToEvenF64 >> ExtendedFltRoundShift); +static_assert(indexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWTowardPositive)) == + TowardNegativeF32_TowardPositiveF64 >> ExtendedFltRoundShift); Index: llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s +; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | 
FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s + +declare i32 @llvm.get.rounding() + +define i32 @func_rounding() { +; GFX678-LABEL: func_rounding: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4) +; GFX678-NEXT: s_lshl_b32 s6, s4, 2 +; GFX678-NEXT: s_mov_b32 s4, 0x8b294761 +; GFX678-NEXT: s_mov_b32 s5, 0x321c3ed +; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX678-NEXT: s_and_b32 s4, s4, 15 +; GFX678-NEXT: v_mov_b32_e32 v0, s4 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: func_rounding: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4) +; GFX9-NEXT: s_lshl_b32 s6, s4, 2 +; GFX9-NEXT: s_mov_b32 s4, 0x8b294761 +; GFX9-NEXT: s_mov_b32 s5, 0x321c3ed +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX9-NEXT: s_and_b32 s4, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: func_rounding: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4) +; GFX10-NEXT: s_lshl_b32 s6, s4, 2 +; GFX10-NEXT: s_mov_b32 s4, 0x8b294761 +; GFX10-NEXT: s_mov_b32 s5, 0x321c3ed +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX10-NEXT: s_and_b32 s4, s4, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: func_rounding: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4) +; GFX11-NEXT: s_lshl_b32 s2, s0, 2 +; GFX11-NEXT: s_mov_b32 s0, 0x8b294761 +; GFX11-NEXT: s_mov_b32 s1, 0x321c3ed +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %rounding = call i32 @llvm.get.rounding() + ret i32 %rounding +} +;; 
NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX1011: {{.*}} +; GFX6: {{.*}} +; GFX7: {{.*}} +; GFX8: {{.*}}
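The table construction in SIModeRegisterDefaults.cpp can be cross-checked outside of LLVM. The following standalone Python sketch (not part of the patch; all names are ad hoc) rebuilds the 64-bit conversion table from the hardware and FLT_ROUNDS encodings described in the lowerGET_ROUNDING comments, and mirrors the lowered shift-and-mask lookup:

```python
# Standalone model (not LLVM code) of AMDGPU's FltRoundConversionTable.
# Hardware MODE register rounding encodings, per the comment in
# lowerGET_ROUNDING: nearest-even=0, +inf=1, -inf=2, toward-zero=3.
HW = {"near": 0, "pinf": 1, "ninf": 2, "zero": 3}
# Standard FLT_ROUNDS / llvm::RoundingMode values.
STD = {"zero": 0, "near": 1, "pinf": 2, "ninf": 3}

# Mirrors F32FltRoundOffset, F64FltRoundOffset, ExtendedFltRoundShift.
F32_OFFSET, F64_OFFSET, EXT_SHIFT = 0, 2, 3

def build_table():
    """Build the 16 four-bit entries, indexed by the raw 4-bit MODE field."""
    table = 0
    for m32, hw32 in HW.items():
        for m64, hw64 in HW.items():
            index = hw32 << F32_OFFSET | hw64 << F64_OFFSET
            if m32 == m64:
                entry = STD[m32]  # modes agree: standard value 0-3
            else:
                # Extended enum value is (std32 | std64 << 2) << EXT_SHIFT;
                # the table stores it pre-shifted down by EXT_SHIFT.
                entry = STD[m32] << F32_OFFSET | STD[m64] << F64_OFFSET
            table |= entry << (4 * index)
    return table

TABLE = build_table()

def get_rounding_entry(raw_mode):
    """Mirror of the lowered sequence: (TABLE >> (raw_mode << 2)) & 0xf."""
    return (TABLE >> (raw_mode << 2)) & 0xf
```

Running this reproduces TABLE == 0x321c3ed8b294761, i.e. exactly the immediate pair (s_mov_b32 s4, 0x8b294761 / s_mov_b32 s5, 0x321c3ed) in the autogenerated checks above. For example, get_rounding_entry(0) is 1 (both modes nearest-even, a standard value), while raw mode 1 (f32 +inf, f64/f16 nearest-even) yields entry 6, the extended value 48 from the AMDGPUUsage table shifted down by 3.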