Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -974,6 +974,20 @@
             [IntrNoMem, IntrSpeculatable]>;
           // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.
 
+// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p1_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
+// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p2_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
+  Intrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
 def int_amdgcn_ps_live : Intrinsic <
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -57,6 +57,7 @@
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -179,6 +180,9 @@
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -455,6 +455,9 @@
   INTERP_MOV,
   INTERP_P1,
   INTERP_P2,
+  INTERP_P1LL_F16,
+  INTERP_P1LV_F16,
+  INTERP_P2_F16,
   PC_ADD_REL_OFFSET,
   KILL,
   DUMMY_CHAIN,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4063,6 +4063,9 @@
   NODE_NAME_CASE(INTERP_MOV)
   NODE_NAME_CASE(INTERP_P1)
   NODE_NAME_CASE(INTERP_P2)
+  NODE_NAME_CASE(INTERP_P1LL_F16)
+  NODE_NAME_CASE(INTERP_P1LV_F16)
+  NODE_NAME_CASE(INTERP_P2_F16)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -372,6 +372,17 @@
                       SDTypeProfile<1, 4, [SDTCisFP<0>]>,
                       [SDNPInGlue]>;
 
+def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
+                            SDTypeProfile<1, 7, [SDTCisFP<0>]>,
+                            [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
+                            SDTypeProfile<1, 9, [SDTCisFP<0>]>,
+                            [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
+                          SDTypeProfile<1, 8, [SDTCisFP<0>]>,
+                          [SDNPInGlue]>;
 
 def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
   [SDNPHasChain, SDNPSideEffect]>;
Index: lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -49,6 +49,8 @@
 def : SourceOfDivergence<int_amdgcn_interp_mov>;
 def : SourceOfDivergence<int_amdgcn_interp_p1>;
 def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
+def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
 def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
 def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
 def : SourceOfDivergence<int_r600_read_tidig_x>;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -136,6 +136,13 @@
   cl::init(true),
   cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -171,6 +178,7 @@
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -884,6 +892,7 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  addPass(createSIModeRegisterPass());
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
   // are multiple scheduling regions in a basic block, the regions are scheduled
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -115,6 +115,7 @@
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4838,6 +4838,59 @@
                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                        Glue);
   }
+  case Intrinsic::amdgcn_interp_p1_f16: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+    SDValue Glue = M0.getValue(1);
+    if (getSubtarget()->getLDSBankCount() == 16) {
+      // 16 bank LDS
+      SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+                              DAG.getConstant(2, DL, MVT::i32), // P0
+                              Op.getOperand(2), // Attrchan
+                              Op.getOperand(3), // Attr
+                              Glue);
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+        S, // Src2 - holds two f16 values selected by high
+        DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp
+        DAG.getConstant(0, DL, MVT::i32) // $omod
+      };
+      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
+    } else {
+      // 32 bank LDS
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp
+        DAG.getConstant(0, DL, MVT::i32), // $omod
+        Glue
+      };
+      return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
+    }
+  }
+  case Intrinsic::amdgcn_interp_p2_f16: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
+    SDValue Glue = SDValue(M0.getNode(), 1);
+    SDValue Ops[] = {
+      Op.getOperand(2), // Src0
+      Op.getOperand(3), // Attrchan
+      Op.getOperand(4), // Attr
+      DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+      Op.getOperand(1), // Src2
+      DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+      Op.getOperand(5), // high
+      DAG.getConstant(0, DL, MVT::i1), // $clamp
+      Glue
+    };
+    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
+  }
   case Intrinsic::amdgcn_sin:
     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
 
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this uses the floating point double precision
+  // rounding mode flags
+  field bit FPDPRounding = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -589,6 +589,14 @@
       return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,234 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass inserts changes to the Mode register settings as required.
+/// Currently only the double precision floating point rounding mode setting is
+/// handled.
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted,
+          "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+namespace {
+
+/// Machine function pass that tracks the double precision FP rounding mode
+/// through the CFG and inserts S_SETREG_IMM32_B32 where an instruction needs
+/// a mode other than the one currently in effect.
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  // Per-block state, indexed by MachineBasicBlock::getNumber(). It is reset at
+  // the start of every function so that nothing leaks between functions.
+  // complete:     the walk from this block through its successors has finished
+  // revisits:     how many times the block is on the current recursion path
+  // canSkipBlock: a scan found no DP rounding user and no setreg in the block
+  SmallVector<bool, 32> complete;
+  SmallVector<unsigned, 32> revisits;
+  SmallVector<bool, 32> canSkipBlock;
+
+  // Set as soon as an instruction is inserted into the current function.
+  bool Changed = false;
+
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  int defaultMode = FP_ROUND_ROUND_TO_NEAREST;
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool processBlock(MachineBasicBlock &MBB, const SIInstrInfo *TII,
+                    int currentMode);
+
+  int getRequiredMode(unsigned opcode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() {
+  return new SIModeRegister();
+}
+
+// Determine the DP FP rounding mode required for this instruction
+int SIModeRegister::getRequiredMode(unsigned opcode) {
+  switch (opcode) {
+  case AMDGPU::V_INTERP_P1LL_F16:
+  case AMDGPU::V_INTERP_P1LV_F16:
+  case AMDGPU::V_INTERP_P2_F16:
+    // f16 interpolation instructions need round to zero
+    return FP_ROUND_ROUND_TO_ZERO;
+  default:
+    return defaultMode;
+  }
+}
+
+// We iterate through the instructions of each block and for any that use the
+// FP DP rounding mode we check that the current mode is appropriate. If
+// not we insert a setreg to change it. If we find a setreg that modifies the
+// rounding mode we track that as the current value.
+// We then recursively propagate the final value to all the successor blocks.
+// For back-edges we need to revisit blocks until we revisit a block and find
+// an instruction that uses the DP rounding mode or a setreg that modifies it
+// (in those cases we know successor blocks already have the required modes set)
+// or we visit a block for the second time (we know there are no instructions
+// that use or set the FP DP rounding mode)
+//
+// currentMode is the DP rounding mode on entry to MBB, or -1 when it is not
+// known. The return value is true when the walk below MBB has completed.
+//
+// NOTE(review): a successor already marked complete is not examined again when
+// it is reached later with a different incoming mode (e.g. the join block of a
+// diamond) - confirm this cannot leave a DP instruction in the wrong mode.
+bool SIModeRegister::processBlock(MachineBasicBlock &MBB,
+                                  const SIInstrInfo *TII, int currentMode)  {
+  if (!canSkipBlock[MBB.getNumber()]) {
+    canSkipBlock[MBB.getNumber()] = true;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.SkipPHIsLabelsAndDebug(MBB.begin()); I != MBB.end();
+                                                      I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+      if (TII->usesFPDPRounding(MI)) {
+        canSkipBlock[MBB.getNumber()] = false;
+        // This instruction uses the DP rounding mode - check that the current
+        // mode is suitable, and if not insert a setreg to change the mode
+        int requiredMode = getRequiredMode(MI.getOpcode());
+        if (currentMode != requiredMode)  {
+          currentMode = requiredMode;
+          BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+                 .addImm(currentMode).addImm(0x881);
+          // Record the edit in the pass itself: the statistic accumulates over
+          // all functions and is not maintained in builds without statistics.
+          Changed = true;
+          ++NumSetregInserted;
+        }
+        if ((revisits[MBB.getNumber()] >= 1 ))
+          return true;
+      } else if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+                 (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+        // track changes to the rounding mode
+
+        canSkipBlock[MBB.getNumber()] = false;
+
+        // ignore setreg if not writing to MODE register
+        unsigned dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)
+                          ->getImm();
+        if (((dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+             AMDGPU::Hwreg::ID_MODE)
+          continue;
+
+        unsigned width = ((dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                          AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + 1;
+        unsigned offset = (dst & AMDGPU::Hwreg::OFFSET_MASK_) >>
+                           AMDGPU::Hwreg::OFFSET_SHIFT_;
+        // build the field mask in 64 bits so that a field as wide as the
+        // register does not shift a 32 bit value by 32, which is undefined
+        unsigned mask =
+            static_cast<unsigned>(((UINT64_C(1) << width) - 1) << offset);
+
+        // skip if not updating any part of the DP rounding mode
+        if ((mask & FP_ROUND_MODE_DP(3)) == 0)
+          continue;
+        // it is possible for the setreg to update only part of the DP mode
+        // field so we'll mask the current and new modes appropriately -
+        // however, if we don't know the current mode we can't use a partial
+        // value
+        bool partial = ((mask & FP_ROUND_MODE_DP(3)) != FP_ROUND_MODE_DP(3));
+        if (partial && (currentMode == -1))
+          continue;
+        if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+          unsigned val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)
+                            ->getImm();
+          // take only the immediate bits that fall inside the written field;
+          // every other bit of the DP mode keeps its current value
+          currentMode = (((val << offset) & mask & FP_ROUND_MODE_DP(3)) |
+                        ((FP_ROUND_MODE_DP(currentMode) & ~mask))) >> 2;
+        } else
+          currentMode = -1;
+        // if it was a partial update we may have a different currentMode from
+        // values via different paths so we need to continue the propagation,
+        // otherwise if we are revisiting the block we can return
+        if ((revisits[MBB.getNumber()] >= 1 ) && !partial)
+          return true;
+      }
+    }
+  }
+
+  // propagate the current mode to all successor blocks
+  bool successorsComplete = true;
+  if (revisits[MBB.getNumber()] < 2) {
+    ++revisits[MBB.getNumber()];
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      if (!complete[Succ->getNumber()])
+        if (!processBlock(*Succ, TII, currentMode))
+          successorsComplete = false;
+    }
+    --revisits[MBB.getNumber()];
+  }
+  complete[MBB.getNumber()] = successorsComplete;
+  return successorsComplete;
+}
+
+// The DP Rounding flags within the Mode register are used to control both
+// 64 bit and 16 bit floating point rounding behavior.
+// The 16 bit interpolation instructions require Round to Zero for correct
+// results, so explicit mode changes may need to be inserted to ensure
+// each instruction has the required mode.
+// Other mode register settings may need to be tracked in the future.
+//
+// Returns true if any setreg was inserted into MF. The pass is required for
+// correct results, so it is not subject to skipFunction().
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  // The pass object is reused for every function, so start from a clean slate;
+  // resize() alone would keep the flags left behind by the previous function.
+  const unsigned NumBlocks = MF.getNumBlockIDs();
+  revisits.assign(NumBlocks, 0);
+  complete.assign(NumBlocks, false);
+  canSkipBlock.assign(NumBlocks, false);
+  Changed = false;
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  processBlock(*MF.begin(), TII, defaultMode);
+
+  return Changed;
+}
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -203,14 +203,14 @@
 defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
 } // End SchedRW = [WriteQuarterRate32]
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
 defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
 } // End SchedRW = [WriteDouble];
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
-} // End SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -219,7 +219,8 @@
 // VOP3 INTERP
 //===----------------------------------------------------------------------===//
 
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+                 VOP3_Pseudo<OpName, P, pattern> {
   let AsmMatchConverter = "cvtVOP3Interp";
 }
 
@@ -290,13 +291,13 @@
 def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
 def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
 def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
@@ -323,6 +324,7 @@
 def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
   getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 } // End Uses = [VCC, EXEC]
 
@@ -353,10 +355,10 @@
 def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -367,6 +369,7 @@
 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
   let SchedRW = [WriteDouble, WriteSALU];
   let AsmMatchConverter = "";
+  let FPDPRounding = 1;
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -377,6 +380,7 @@
 
 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 
 let SchedRW = [Write64Bit] in {
@@ -428,8 +432,17 @@
 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
 def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+       [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+                                                        (i32 imm:$attr),
+                                                        (i32 imm:$src0_modifiers),
+                                                        (f32 VRegSrc_32:$src2),
+                                                        (i32 imm:$src2_modifiers),
+                                                        (i1 imm:$high),
+                                                        (i1 imm:$clamp)))]>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
+} // End renamedInGfx9 = 1
 
 let SubtargetPredicate = isGFX9 in {
 def V_MAD_F16_gfx9   : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
@@ -439,8 +452,24 @@
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+                                                          (i32 imm:$attr),
+                                                          (i32 imm:$src0_modifiers),
+                                                          (i1 imm:$high),
+                                                          (i1 imm:$clamp),
+                                                          (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+                                                          (i32 imm:$attr),
+                                                          (i32 imm:$src0_modifiers),
+                                                          (f32 VRegSrc_32:$src2),
+                                                          (i32 imm:$src2_modifiers),
+                                                          (i1 imm:$high),
+                                                          (i1 imm:$clamp),
+                                                          (i32 imm:$omod)))]>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9_32BANK %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_32BANK %s
+; RUN: llc -march=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_16BANK %s
+
+; GFX9_32BANK-LABEL: {{^}}interp_f16:
+; GFX9_32BANK: s_mov_b32 m0, s{{[0-9]+}}
+; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}}
+; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high
+; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+; GFX8_32BANK-LABEL: {{^}}interp_f16:
+; GFX8_32BANK: s_mov_b32 m0, s{{[0-9]+}}
+; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}}
+; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high
+; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+; GFX8_16BANK-LABEL: {{^}}interp_f16:
+; GFX8_16BANK: s_mov_b32 m0, s{{[0-9]+}}
+; there should be only one v_interp_mov
+; GFX8_16BANK: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y
+; GFX8_16BANK-NOT: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y
+; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+main_body:
+  %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0)
+  %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 %m0)
+  %res = fadd half %p2_0, %p2_1
+  ret half %res
+}
+
+; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
+declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
+; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
+declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/mode-register.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,176 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register  %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to rtz from default rtn for interp f16
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+  
+name: interp_f16_default
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+  
+name: interp_f16_explicit_rtz
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_SQRT_F64
+  
+name: rtn_default
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for F64 instruction
+# CHECK-LABEL: name: rtn_from_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_SQRT_F64
+  
+name: rtn_from_rtz
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for F64 instruction
+# and back again for remaining interp instruction
+# CHECK-LABEL: name: interp_f16_plus_sqrt_f64
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+  
+name: interp_f16_plus_sqrt_f64
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that an explicit change to the single precision mode has no effect
+# CHECK-LABEL: name: single_precision_mode_change
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+  
+name: single_precision_mode_change
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 2, 2049
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that mode is propagated back to start of loop - first instruction is RTN but needs 
+# setreg as RTZ is set in loop
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+  
+name: loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM
+...