Index: llvm/docs/AMDGPUUsage.rst
===================================================================
--- llvm/docs/AMDGPUUsage.rst
+++ llvm/docs/AMDGPUUsage.rst
@@ -962,63 +962,82 @@
   .. table:: AMDGPU LLVM IR Intrinsics
      :name: amdgpu-llvm-ir-intrinsics-table
 
-     ========================================= ==========================================================
-     LLVM Intrinsic                            Description
-     ========================================= ==========================================================
-     llvm.amdgcn.sqrt                          Provides direct access to v_sqrt_f64, v_sqrt_f32 and v_sqrt_f16
-                                               (on targets with half support). Peforms sqrt function.
-
-     llvm.amdgcn.log                           Provides direct access to v_log_f32 and v_log_f16
-                                               (on targets with half support). Peforms log2 function.
-
-     llvm.amdgcn.exp2                          Provides direct access to v_exp_f32 and v_exp_f16
-                                               (on targets with half support). Performs exp2 function.
-
-     :ref:`llvm.frexp <int_frexp>`             Implemented for half, float and double.
-
-     :ref:`llvm.log2 <int_log2>`               Implemented for float and half (and vectors of float or
-                                               half). Not implemented for double. Hardware provides
-                                               1ULP accuracy for float, and 0.51ULP for half. Float
-                                               instruction does not natively support denormal
-                                               inputs. Backend will optimize out denormal scaling if
-                                               marked with the :ref:`afn <fastmath_afn>` flag.
-
-     :ref:`llvm.sqrt <int_sqrt>`               Implemented for double, float and half (and vectors).
-
-     :ref:`llvm.log <int_log>`                 Implemented for float and half (and vectors).
-
-     :ref:`llvm.exp <int_exp>`                 Implemented for float and half (and vectors).
-
-     :ref:`llvm.log10 <int_log10>`             Implemented for float and half (and vectors).
-
-     :ref:`llvm.exp2 <int_exp2>`               Implemented for float and half (and vectors of float or
-                                               half). Not implemented for double. Hardware provides
-                                               1ULP accuracy for float, and 0.51ULP for half. Float
-                                               instruction does not natively support denormal
-                                               inputs. Backend will optimize out denormal scaling if
-                                               marked with the :ref:`afn <fastmath_afn>` flag.
-
-     llvm.amdgcn.wave.reduce.umin              Performs an arithmetic unsigned min reduction on the unsigned values
-                                               provided by each lane in the wavefront.
-                                               Intrinsic takes a hint for reduction strategy using second operand
-                                               0: Target default preference,
-                                               1: `Iterative strategy`, and
-                                               2: `DPP`.
-                                               If target does not support the DPP operations (e.g. gfx6/7),
-                                               reduction will be performed using default iterative strategy.
-                                               Intrinsic is currently only implemented for i32.
-
-     llvm.amdgcn.wave.reduce.umax              Performs an arithmetic unsigned max reduction on the unsigned values
-                                               provided by each lane in the wavefront.
-                                               Intrinsic takes a hint for reduction strategy using second operand
-                                               0: Target default preference,
-                                               1: `Iterative strategy`, and
-                                               2: `DPP`.
-                                               If target does not support the DPP operations (e.g. gfx6/7),
-                                               reduction will be performed using default iterative strategy.
-                                               Intrinsic is currently only implemented for i32.
-
-     ========================================= ==========================================================
+     =========================================== ==========================================================
+     LLVM Intrinsic                              Description
+     =========================================== ==========================================================
+     llvm.amdgcn.sqrt                            Provides direct access to v_sqrt_f64, v_sqrt_f32 and v_sqrt_f16
+                                                 (on targets with half support). Performs sqrt function.
+
+     llvm.amdgcn.log                             Provides direct access to v_log_f32 and v_log_f16
+                                                 (on targets with half support). Performs log2 function.
+
+     llvm.amdgcn.exp2                            Provides direct access to v_exp_f32 and v_exp_f16
+                                                 (on targets with half support). Performs exp2 function.
+     :ref:`llvm.frexp <int_frexp>`               Implemented for half, float and double.
+
+     :ref:`llvm.log2 <int_log2>`                 Implemented for float and half (and vectors of float or
+                                                 half). Not implemented for double. Hardware provides
+                                                 1ULP accuracy for float, and 0.51ULP for half. Float
+                                                 instruction does not natively support denormal
+                                                 inputs. Backend will optimize out denormal scaling if
+                                                 marked with the :ref:`afn <fastmath_afn>` flag.
+
+     :ref:`llvm.sqrt <int_sqrt>`                 Implemented for double, float and half (and vectors).
+
+     :ref:`llvm.log <int_log>`                   Implemented for float and half (and vectors).
+
+     :ref:`llvm.exp <int_exp>`                   Implemented for float and half (and vectors).
+
+     :ref:`llvm.log10 <int_log10>`               Implemented for float and half (and vectors).
+
+     :ref:`llvm.exp2 <int_exp2>`                 Implemented for float and half (and vectors of float or
+                                                 half). Not implemented for double. Hardware provides
+                                                 1ULP accuracy for float, and 0.51ULP for half. Float
+                                                 instruction does not natively support denormal
+                                                 inputs. Backend will optimize out denormal scaling if
+                                                 marked with the :ref:`afn <fastmath_afn>` flag.
+
+     llvm.amdgcn.wave.reduce.umin                Performs an arithmetic unsigned min reduction on the unsigned values
+                                                 provided by each lane in the wavefront.
+                                                 Intrinsic takes a hint for reduction strategy using second operand
+                                                 0: Target default preference,
+                                                 1: `Iterative strategy`, and
+                                                 2: `DPP`.
+                                                 If target does not support the DPP operations (e.g. gfx6/7),
+                                                 reduction will be performed using default iterative strategy.
+                                                 Intrinsic is currently only implemented for i32.
+
+     llvm.amdgcn.wave.reduce.umax                Performs an arithmetic unsigned max reduction on the unsigned values
+                                                 provided by each lane in the wavefront.
+                                                 Intrinsic takes a hint for reduction strategy using second operand
+                                                 0: Target default preference,
+                                                 1: `Iterative strategy`, and
+                                                 2: `DPP`.
+                                                 If target does not support the DPP operations (e.g. gfx6/7),
+                                                 reduction will be performed using default iterative strategy.
+                                                 Intrinsic is currently only implemented for i32.
+
+     :ref:`llvm.get.rounding <int_get_rounding>` AMDGPU supports two separately controllable rounding
+                                                 modes depending on the floating-point type. One
+                                                 controls float, and the other controls both double and
+                                                 half operations. If both modes are the same, returns
+                                                 one of the standard return values. If the modes are
+                                                 different, returns one of :ref:`12 extended values
+                                                 <amdgpu-rounding-mode-enumeration-values-table>`
+                                                 describing the two modes.
+
+                                                 To nearest, ties away from zero is not a supported
+                                                 mode. The raw rounding mode values in the MODE
+                                                 register do not exactly match the FLT_ROUNDS values,
+                                                 so a conversion is performed.
+
+     =========================================== ==========================================================
+
 .. TODO::
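To make the documented value ranges concrete, here is a minimal caller-side sketch of how a result of ``llvm.get.rounding`` could be classified on AMDGPU. The helper and enum names are hypothetical, not part of this patch; the sketch assumes only the encoding documented above: 0-3 are the standard FLT_ROUNDS values, 4 (to nearest, ties away from zero) is never produced, and 8-19 are the extended mixed-mode values from the table added below.

.. code-block:: c++

   #include <cstdint>

   enum class RoundingClass { Standard, ExtendedMixed, Unknown };

   // Hypothetical helper: classify a value returned by llvm.get.rounding.
   RoundingClass classifyAMDGPUFltRounds(int32_t R) {
     if (R >= 0 && R <= 3)
       return RoundingClass::Standard;      // f32 and f64/f16 modes agree
     if (R >= 8 && R <= 19)
       return RoundingClass::ExtendedMixed; // the two modes differ
     return RoundingClass::Unknown;         // e.g. 4 is never returned
   }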
@@ -4854,6 +4873,22 @@
      FLOAT_ROUND_MODE_ZERO                  3     Round Toward 0
      ====================================== ===== ==============================
 
+
+  .. table:: Extended FLT_ROUNDS Enumeration Values
+     :name: amdgpu-rounding-mode-enumeration-values-table
+
+     +------------------------+---------------+-------------------+--------------------+----------+
+     |                        | F32 NEAR_EVEN | F32 PLUS_INFINITY | F32 MINUS_INFINITY | F32 ZERO |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 NEAR_EVEN      | 1             | 11                | 14                 | 17       |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 PLUS_INFINITY  | 8             | 2                 | 15                 | 18       |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 MINUS_INFINITY | 9             | 12                | 3                  | 19       |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 ZERO           | 10            | 13                | 16                 | 0        |
+     +------------------------+---------------+-------------------+--------------------+----------+
+
   ..
 
   .. table:: Floating Point Denorm Mode Enumeration Values
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -25415,6 +25415,8 @@
 mode or state of floating point exceptions. Altering the floating point
 environment requires special care. See :ref:`Floating Point Environment
 <floatenv>`.
 
+.. _int_get_rounding:
+
 '``llvm.get.rounding``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Index: llvm/docs/ReleaseNotes.rst
===================================================================
--- llvm/docs/ReleaseNotes.rst
+++ llvm/docs/ReleaseNotes.rst
@@ -71,6 +71,8 @@
 * `llvm.sqrt.f64` is now lowered correctly. Use `llvm.amdgcn.sqrt.f64`
   for raw instruction access.
 
+* Implemented :ref:`llvm.get.rounding <int_get_rounding>`
+
 Changes to the ARM Backend
 --------------------------
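The grid table above fully determines a decoding of each extended value back into the two per-type modes. As a sketch (the function name is hypothetical; it assumes the standard FLT_ROUNDS encoding 0 = toward zero, 1 = nearest even, 2 = toward +inf, 3 = toward -inf):

.. code-block:: c++

   #include <cstdint>
   #include <utility>

   // Hypothetical decoder: map an llvm.get.rounding result to the pair
   // {f32 mode, f64/f16 mode}, both in the standard 0-3 FLT_ROUNDS encoding.
   std::pair<int, int> decodeAMDGPUFltRounds(int32_t R) {
     if (R >= 0 && R <= 3)
       return {R, R};               // both modes agree
     // Extended values 8-19, in the order defined by the table above.
     static const std::pair<int, int> Extended[12] = {
         {1, 2}, {1, 3}, {1, 0},    // 8-10:  f32 nearest-even
         {2, 1}, {2, 3}, {2, 0},    // 11-13: f32 toward +inf
         {3, 1}, {3, 2}, {3, 0},    // 14-16: f32 toward -inf
         {0, 1}, {0, 2}, {0, 3}};   // 17-19: f32 toward zero
     return Extended[R - 8];
   }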
Index: llvm/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -872,6 +872,7 @@
   ///   2 Round to +inf
   ///   3 Round to -inf
   ///   4 Round to nearest, ties to zero
+  ///   Other values are target dependent.
   /// Result is rounding mode and chain. Input is a chain.
   GET_ROUNDING,
 
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -409,6 +409,7 @@
   SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -754,6 +754,8 @@
                       MVT::i8, MVT::i128},
                      Custom);
 
+  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+
   setTargetDAGCombine({ISD::ADD,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
@@ -3520,6 +3522,77 @@
   return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
 }
 
+SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  assert(Op.getValueType() == MVT::i32);
+
+  uint32_t BothRoundHwReg =
+      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+  SDValue IntrinID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
+  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
+                               Op.getOperand(0), IntrinID, GetRoundBothImm);
+
+  // There are two rounding modes, one for f32 and one for f64/f16. We only
+  // report in the standard value range if both are the same.
+  //
+  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
+  // ties away from zero is not supported, and the other values are rotated by
+  // 1.
+  //
+  // If the two rounding modes are not the same, report a target defined value.
+
+  // Mode register rounding mode fields:
+  //
+  // [1:0] Single-precision round mode.
+  // [3:2] Double/Half-precision round mode.
+  //
+  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
+  //
+  //                 Hardware   Spec
+  // Toward-0            3        0
+  // Nearest Even        0        1
+  // +Inf                1        2
+  // -Inf                2        3
+  // NearestAway0       N/A       4
+  //
+  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
+  // table we can index by the raw hardware mode.
+  //
+  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
+
+  SDValue BitTable =
+      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
+
+  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+  SDValue RoundModeTimesNumBits =
+      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
+
+  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
+  // knew only one mode was demanded.
+  SDValue TableValue =
+      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
+  SDValue TableEntry =
+      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
+
+  // There's a gap in the 4-bit encoded table and actual enum values, so offset
+  // if it's an extended value.
+  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+  SDValue IsStandardValue =
+      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
+  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
+  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+                               TableEntry, EnumOffset);
+
+  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5027,6 +5100,8 @@
     return lowerXMUL_LOHI(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::GET_ROUNDING:
+    return lowerGET_ROUNDING(Op, DAG);
   }
   return SDValue();
 }
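The DAG sequence built by lowerGET_ROUNDING is equivalent to the following scalar computation. This is a sketch for exposition only; it assumes HwMode is the 4-bit MODE.fp_round field read by s_getreg_b32 and Table is AMDGPU::FltRoundConversionTable:

.. code-block:: c++

   #include <cstdint>

   // Scalar equivalent of the node sequence in lowerGET_ROUNDING.
   int32_t emulateGetRounding(uint32_t HwMode, uint64_t Table) {
     // (trunc (FltRoundConversionTable >> (MODE.fp_round << 2))) & 0xf
     uint32_t TableEntry = static_cast<uint32_t>(Table >> (HwMode << 2)) & 0xf;
     // Nibbles 0-3 hold the standard FLT_ROUNDS values; extended values are
     // stored biased down by 4 to fit in a nibble, so add the offset back.
     return TableEntry < 4 ? TableEntry : TableEntry + 4;
   }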
Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
+++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -85,6 +85,65 @@
   }
 };
 
+namespace AMDGPU {
+
+/// Return values used for llvm.get.rounding
+///
+/// When both the F32 and F64/F16 modes are the same, returns the standard
+/// values. If they differ, returns an extended mode starting at 8.
+enum AMDGPUFltRounds : int8_t {
+  // Inherit everything from RoundingMode
+  TowardZero = static_cast<int8_t>(RoundingMode::TowardZero),
+  NearestTiesToEven = static_cast<int8_t>(RoundingMode::NearestTiesToEven),
+  TowardPositive = static_cast<int8_t>(RoundingMode::TowardPositive),
+  TowardNegative = static_cast<int8_t>(RoundingMode::TowardNegative),
+  NearestTiesToAwayUnsupported =
+      static_cast<int8_t>(RoundingMode::NearestTiesToAway),
+
+  Dynamic = static_cast<int8_t>(RoundingMode::Dynamic),
+
+  // Permute the mismatched rounding mode cases. If the modes are the same, use
+  // the standard values, otherwise, these values are sorted such that higher
+  // hardware encoded values have higher enum values.
+  NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven,
+  NearestTiesToEvenF32_TowardPositiveF64 = 8,
+  NearestTiesToEvenF32_TowardNegativeF64 = 9,
+  NearestTiesToEvenF32_TowardZeroF64 = 10,
+
+  TowardPositiveF32_NearestTiesToEvenF64 = 11,
+  TowardPositiveF32_TowardPositiveF64 = TowardPositive,
+  TowardPositiveF32_TowardNegativeF64 = 12,
+  TowardPositiveF32_TowardZeroF64 = 13,
+
+  TowardNegativeF32_NearestTiesToEvenF64 = 14,
+  TowardNegativeF32_TowardPositiveF64 = 15,
+  TowardNegativeF32_TowardNegativeF64 = TowardNegative,
+  TowardNegativeF32_TowardZeroF64 = 16,
+
+  TowardZeroF32_NearestTiesToEvenF64 = 17,
+  TowardZeroF32_TowardPositiveF64 = 18,
+  TowardZeroF32_TowardNegativeF64 = 19,
+  TowardZeroF32_TowardZeroF64 = TowardZero,
+
+  Invalid = static_cast<int8_t>(RoundingMode::Invalid)
+};
+
+/// Offset of nonstandard values for llvm.get.rounding results from the
+/// largest supported mode.
+static constexpr uint32_t ExtendedFltRoundOffset = 4;
+
+/// Offset in mode register of f32 rounding mode.
+static constexpr uint32_t F32FltRoundOffset = 0;
+
+/// Offset in mode register of f64/f16 rounding mode.
+static constexpr uint32_t F64FltRoundOffset = 2;
+
+// Bit indexed table to convert from hardware rounding mode values to
+// FLT_ROUNDS values.
+extern const uint64_t FltRoundConversionTable;
+
+} // end namespace AMDGPU
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
Index: llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -36,3 +36,135 @@
     FP64FP16Denormals = DenormMode;
   }
 }
+
+using namespace AMDGPU;
+
+/// Combine f32 and f64 rounding modes into a combined rounding mode value.
+static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val,
+                                                   uint32_t HWFP64Val) {
+  return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset;
+}
+
+static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal,
+                                               uint32_t HWF32Val,
+                                               uint32_t HWF64Val) {
+  uint32_t ModeVal = getModeRegisterRoundMode(HWF32Val, HWF64Val);
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+
+  uint32_t BitIndex = ModeVal << 2;
+  return static_cast<uint64_t>(FltRoundsVal) << BitIndex;
+}
+
+// Encode FLT_ROUNDS value where the two rounding modes are the same and use a
+// standard value
+static constexpr uint64_t
+encodeFltRoundsTableSame(AMDGPUFltRounds FltRoundsMode, uint32_t HWVal) {
+  return encodeFltRoundsTable(FltRoundsMode, HWVal, HWVal);
+}
+
+// Convert mode register encoded rounding mode to AMDGPUFltRounds
+static constexpr AMDGPUFltRounds
+decodeIndexFltRoundConversionTable(uint32_t HWMode) {
+  uint32_t TableRead = (FltRoundConversionTable >> (HWMode << 2)) & 0xf;
+  if (TableRead > TowardNegative)
+    TableRead += ExtendedFltRoundOffset;
+  return static_cast<AMDGPUFltRounds>(TableRead);
+}
+
+static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO;
+static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST;
+static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF;
+static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF;
+
+constexpr uint64_t AMDGPU::FltRoundConversionTable =
+    encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) |
+    encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64,
+                             HWNearestTiesToEven) |
+    encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64,
+                             HWTowardPositive) |
+    encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64,
+                             HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero,
+                         HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero,
+                         HWTowardPositive) |
+    encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64,
+                         HWNearestTiesToEven, HWTowardZero) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64,
+                         HWNearestTiesToEven, HWTowardPositive) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64,
+                         HWNearestTiesToEven, HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64,
+                         HWTowardPositive, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64,
+                         HWTowardNegative, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative,
+                         HWTowardPositive);
+
+// Verify evaluation of FltRoundConversionTable
+
+// If both modes are the same, should return the standard values.
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWNearestTiesToEven)) ==
+              AMDGPUFltRounds::NearestTiesToEven);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardPositive)) ==
+              AMDGPUFltRounds::TowardPositive);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardNegative)) ==
+              AMDGPUFltRounds::TowardNegative);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWNearestTiesToEven)) ==
+              TowardZeroF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) ==
+              TowardZeroF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) ==
+              TowardZeroF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardZero)) ==
+              NearestTiesToEvenF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardPositive)) ==
+              NearestTiesToEvenF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardNegative)) ==
+              NearestTiesToEvenF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) ==
+              TowardPositiveF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWNearestTiesToEven)) ==
+              TowardPositiveF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardNegative)) ==
+              TowardPositiveF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) ==
+              TowardNegativeF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWNearestTiesToEven)) ==
+              TowardNegativeF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardPositive)) ==
+              TowardNegativeF32_TowardPositiveF64);
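As a worked example of this encoding (a sketch; nibbleAt is a hypothetical helper, and the 64-bit constant is taken from the s_mov_b32 pair in the generated test below, high word 0xc96f385, low word 0xeb24da71): for f32 toward-zero (hardware 3) and f64/f16 nearest-even (hardware 0), getModeRegisterRoundMode yields 3 | (0 << 2) = 3, and the stored nibble is TowardZeroF32_NearestTiesToEvenF64 (17) minus ExtendedFltRoundOffset (4), i.e. 0xd.

.. code-block:: c++

   #include <cstdint>

   // Hypothetical helper: read the 4-bit table entry for a hardware mode.
   constexpr uint32_t nibbleAt(uint64_t Table, uint32_t HwMode) {
     return static_cast<uint32_t>(Table >> (HwMode << 2)) & 0xf;
   }

   // 0x0c96f385eb24da71 is FltRoundConversionTable as materialized by the
   // s_mov_b32 pair in the test below.
   static_assert(nibbleAt(0x0c96f385eb24da71ULL, 3) == 17 - 4);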
Index: llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+
+declare i32 @llvm.get.rounding()
+
+define i32 @func_rounding() {
+; GFX678-LABEL: func_rounding:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX678-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX678-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX678-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX678-NEXT:    s_and_b32 s4, s4, 15
+; GFX678-NEXT:    s_add_i32 s5, s4, 4
+; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX678-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: func_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX9-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX9-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT:    s_and_b32 s4, s4, 15
+; GFX9-NEXT:    s_add_i32 s5, s4, 4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: func_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX10-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX10-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT:    s_and_b32 s4, s4, 15
+; GFX10-NEXT:    s_add_i32 s5, s4, 4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_add_i32 s1, s0, 4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %rounding = call i32 @llvm.get.rounding()
+  ret i32 %rounding
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX1011: {{.*}}
+; GFX6: {{.*}}
+; GFX7: {{.*}}
+; GFX8: {{.*}}