diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -312,7 +312,7 @@ (instrs BCLR, BCLRn, BDNZLR, BDNZLR8, BDNZLRm, BDNZLRp, BDZLR, BDZLR8, BDZLRm, BDZLRp, gBCLR, BCLRL, BCLRLn, BDNZLRL, BDNZLRLm, BDNZLRLp, BDZLRL, BDZLRLm, BDZLRLp, gBCLRL, - BL, BL8, BL8_NOP, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_TLS, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_TLS + BL, BL8, BL8_NOP, BL8_NOP_RM, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_RM, BL8_NOTOC_TLS, BL8_RM, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_NOP_RM, BL_RM, BL_TLS )>; // 2 Cycles Branch operations, 1 input operands @@ -320,9 +320,9 @@ (instrs B, BCC, BCCA, BCCCTR, BCCCTR8, BCCCTRL, BCCCTRL8, BCCL, BCCLA, BCCLR, BCCLRL, CTRL_DEP, TAILB, TAILB8, BA, TAILBA, TAILBA8, - BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat, + BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL8_LDinto_toc_RM, BCTRL8_RM, BCTRL_LWZinto_toc, BCTRL_LWZinto_toc_RM, BCTRL_RM, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat, BCL, BCLalways, BCLn, BDNZL, BDNZLm, BDNZLp, BDZL, BDZLm, BDZLp, gBCL, gBCLat, - BLA, BLA8, BLA8_NOP + BLA, BLA8, BLA8_NOP, BLA8_NOP_RM, BLA8_RM, BLA_RM )>; // 2 Cycles Branch operations, 3 input operands diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1302,15 +1302,15 @@ (instregex "BCCTR(L)?(8)?(n)?$"), (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"), (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"), - (instregex "BL(_TLS|_NOP)?$"), - (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"), - (instregex "BLA(8|8_NOP)?$"), + (instregex "BL(_TLS|_NOP)?(_RM)?$"), + (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?(_RM)?$"), + (instregex "BLA(8|8_NOP)?(_RM)?$"), (instregex "BLR(8|L)?$"), (instregex "TAILB(A)?(8)?$"), (instregex "TAILBCTR(8)?$"), (instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"), (instregex "BCLR(L)?(n)?$"), - (instregex "BCTR(L)?(8)?$"), + (instregex "BCTR(L)?(8)?(_RM)?$"), B, BA, BC, @@ -1321,6 +1321,8 @@ BCLn, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, + BCTRL8_LDinto_toc_RM, + BCTRL_LWZinto_toc_RM, BCn, CTRL_DEP )>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -200,6 +200,14 @@ /// and 64-bit AIX. BCTRL_LOAD_TOC, + /// The variants that implicitly define rounding mode for calls with + /// strictfp semantics. 
+    CALL_RM,
+    CALL_NOP_RM,
+    CALL_NOTOC_RM,
+    BCTRL_RM,
+    BCTRL_LOAD_TOC_RM,
+
     /// Return with a flag operand, matched by 'blr'
     RET_FLAG,
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1630,9 +1630,19 @@
   case PPCISD::CALL:            return "PPCISD::CALL";
   case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
   case PPCISD::CALL_NOTOC:      return "PPCISD::CALL_NOTOC";
+  case PPCISD::CALL_RM:
+    return "PPCISD::CALL_RM";
+  case PPCISD::CALL_NOP_RM:
+    return "PPCISD::CALL_NOP_RM";
+  case PPCISD::CALL_NOTOC_RM:
+    return "PPCISD::CALL_NOTOC_RM";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
   case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
+  case PPCISD::BCTRL_RM:
+    return "PPCISD::BCTRL_RM";
+  case PPCISD::BCTRL_LOAD_TOC_RM:
+    return "PPCISD::BCTRL_LOAD_TOC_RM";
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
   case PPCISD::READ_TIME_BASE:  return "PPCISD::READ_TIME_BASE";
   case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
@@ -5172,13 +5182,14 @@
 }
 
 static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
-                              const Function &Caller,
-                              const SDValue &Callee,
+                              const Function &Caller, const SDValue &Callee,
                               const PPCSubtarget &Subtarget,
-                              const TargetMachine &TM) {
+                              const TargetMachine &TM,
+                              bool IsStrictFPCall = false) {
   if (CFlags.IsTailCall)
     return PPCISD::TC_RETURN;
 
+  unsigned RetOpc = 0;
   // This is a call through a function pointer.
   if (CFlags.IsIndirect) {
     // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
@@ -5189,28 +5200,46 @@
     // immediately followed by a load of the TOC pointer from the the stack save
     // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
     // as it is not saved or used.
-    return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
-                                               : PPCISD::BCTRL;
-  }
-
-  if (Subtarget.isUsingPCRelativeCalls()) {
+    RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+                                                 : PPCISD::BCTRL;
+  } else if (Subtarget.isUsingPCRelativeCalls()) {
     assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
-    return PPCISD::CALL_NOTOC;
+    RetOpc = PPCISD::CALL_NOTOC;
+  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+    // The ABIs that maintain a TOC pointer across calls need to have a nop
+    // immediately following the call instruction if the caller and callee may
+    // have different TOC bases. At link time if the linker determines the calls
+    // may not share a TOC base, the call is redirected to a trampoline inserted
+    // by the linker. The trampoline will (among other things) save the caller's
+    // TOC pointer at an ABI designated offset in the linkage area and the
+    // linker will rewrite the nop to be a load of the TOC pointer from the
+    // linkage area into gpr2.
+    RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
+                                                    : PPCISD::CALL_NOP;
+  else
+    RetOpc = PPCISD::CALL;
+  if (IsStrictFPCall) {
+    switch (RetOpc) {
+    default:
+      llvm_unreachable("Unknown call opcode");
+    case PPCISD::BCTRL_LOAD_TOC:
+      RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
+      break;
+    case PPCISD::BCTRL:
+      RetOpc = PPCISD::BCTRL_RM;
+      break;
+    case PPCISD::CALL_NOTOC:
+      RetOpc = PPCISD::CALL_NOTOC_RM;
+      break;
+    case PPCISD::CALL:
+      RetOpc = PPCISD::CALL_RM;
+      break;
+    case PPCISD::CALL_NOP:
+      RetOpc = PPCISD::CALL_NOP_RM;
+      break;
+    }
   }
-
-  // The ABIs that maintain a TOC pointer accross calls need to have a nop
-  // immediately following the call instruction if the caller and callee may
-  // have different TOC bases. At link time if the linker determines the calls
-  // may not share a TOC base, the call is redirected to a trampoline inserted
-  // by the linker. The trampoline will (among other things) save the callers
-  // TOC pointer at an ABI designated offset in the linkage area and the linker
-  // will rewrite the nop to be a load of the TOC pointer from the linkage area
-  // into gpr2.
-  if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
-    return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
-                                                  : PPCISD::CALL_NOP;
-
-  return PPCISD::CALL;
+  return RetOpc;
 }
 
 static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
@@ -5506,7 +5535,7 @@
 
   unsigned CallOpc =
       getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
-                    Subtarget, DAG.getTarget());
+                    Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false);
 
   if (!CFlags.IsIndirect)
     Callee = transformCallee(Callee, DAG, dl, Subtarget);
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -178,6 +178,39 @@
   }
 }
 
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0,
+    isCodeGenOnly = 1, Uses = [RM] in {
+  // Convenient aliases for call instructions
+  def BL8_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                     "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+
+  def BLA8_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                      "bla $func", IIC_BrB, [(PPCcall_rm (i64 imm:$func))]>;
+  def BL8_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                          (outs), (ins calltarget:$func),
+                                          "bl $func\n\tnop", IIC_BrB, []>;
+
+  def BLA8_NOP_RM : IForm_and_DForm_4_zero<18, 1, 1, 24,
+                                           (outs), (ins abscalltarget:$func),
+                                           "bla $func\n\tnop", IIC_BrB,
+                                           [(PPCcall_nop_rm (i64 imm:$func))]>;
+  let Predicates = [PCRelativeMemops] in {
+    // BL8_NOTOC means that the caller does not use the TOC pointer and if
+    // it does use R2 then it is just a caller saved register. Therefore it is
+    // safe to emit only the bl and not the nop for this instruction. The
+    // linker will not try to restore R2 after the call.
+ def BL8_NOTOC_RM : IForm<18, 0, 1, (outs), + (ins calltarget:$func), + "bl $func", IIC_BrB, []>; + } + let Uses = [CTR8, RM] in { + let isPredicable = 1 in + def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In64BitMode]>; + } +} + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in { def BCTRL8_LDinto_toc : @@ -188,6 +221,16 @@ Requires<[In64BitMode]>; } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2, RM], Uses = [CTR8, RM], RST = 2 in { + def BCTRL8_LDinto_toc_RM : + XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs), + (ins memrix:$src), + "bctrl\n\tld 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddrX4:$src)]>, + Requires<[In64BitMode]>; +} + } // Interpretation64Bit // FIXME: Duplicating this for the asm parser should be unnecessary, but the @@ -214,12 +257,32 @@ def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)), (BL8_NOTOC texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i64 tglobaladdr:$dst)), + (BL8_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 tglobaladdr:$dst)), + (BL8_NOP_RM tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i64 texternalsym:$dst)), + (BL8_RM texternalsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 texternalsym:$dst)), + (BL8_NOP_RM texternalsym:$dst)>; + +def : Pat<(PPCcall_notoc_rm (i64 tglobaladdr:$dst)), + (BL8_NOTOC_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_notoc_rm (i64 texternalsym:$dst)), + (BL8_NOTOC_RM texternalsym:$dst)>; + // Calls for AIX def : Pat<(PPCcall (i64 mcsym:$dst)), (BL8 mcsym:$dst)>; def : Pat<(PPCcall_nop (i64 mcsym:$dst)), (BL8_NOP mcsym:$dst)>; +def : Pat<(PPCcall_rm (i64 mcsym:$dst)), + (BL8_RM mcsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)), + (BL8_NOP_RM mcsym:$dst)>; + // Atomic operations // FIXME: some of these might be used with constant operands. This will result // in constant materialization instructions that may be redundant. We currently diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2246,11 +2246,13 @@ return true; } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL || - OpC == PPC::BCTRL8) { + OpC == PPC::BCTRL8 || OpC == PPC::BCTRL_RM || + OpC == PPC::BCTRL8_RM) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); - bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; + bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8 || + OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM; bool isPPC64 = Subtarget.isPPC64(); if (Pred[0].getImm() == PPC::PRED_BIT_SET) { @@ -2274,6 +2276,9 @@ MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit) .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine); + if (OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM) + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(PPC::RM, RegState::ImplicitDefine); return true; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -316,6 +316,24 @@ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Call nodes for strictfp calls (that define RM). 
+def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_nop_rm : SDNode<"PPCISD::CALL_NOP_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_notoc_rm : SDNode<"PPCISD::CALL_NOTOC_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_rm : SDNode<"PPCISD::BCTRL_RM", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -1892,6 +1910,26 @@ } } +let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. + def BLA_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall_rm (i32 imm:$func))]>; + + def BL_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + } + let Uses = [CTR, RM] in { + let isPredicable = 1 in + def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In32BitMode]>; + } +} + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in def TCRETURNdi :PPCEmitTimePseudo< (outs), (ins calltarget:$dst, i32imm:$offset), @@ -1918,6 +1956,14 @@ } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in { + def BCTRL_LWZinto_toc_RM: + XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), + (ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddr:$src)]>, Requires<[In32BitMode]>; + +} let isCodeGenOnly = 1, hasSideEffects = 0 in { @@ -3435,6 +3481,12 @@ def : Pat<(PPCcall (i32 texternalsym:$dst)), (BL texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 tglobaladdr:$dst)), + (BL_RM tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i32 texternalsym:$dst)), + (BL_RM texternalsym:$dst)>; + // Calls for AIX only def : Pat<(PPCcall (i32 mcsym:$dst)), (BL mcsym:$dst)>; @@ -3445,6 +3497,15 @@ def : Pat<(PPCcall_nop (i32 texternalsym:$dst)), (BL_NOP texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 mcsym:$dst)), + (BL_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)), + (BL_NOP_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)), + (BL_NOP_RM texternalsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -906,16 +906,13 @@ // Rounding Instructions respecting current rounding mode def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), - "xsrdpic $XT, $XB", IIC_VecFP, - [(set f64:$XT, (fnearbyint f64:$XB))]>; + "xsrdpic $XT, $XB", IIC_VecFP, []>; def XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + "xvrdpic $XT, $XB", IIC_VecFP, []>; def XVRSPIC : XX2Form<60, 171, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (fnearbyint 
v4f32:$XB))]>; + "xvrspic $XT, $XB", IIC_VecFP, []>; // Max/Min Instructions let isCommutable = 1 in { def XSMAXDP : XX3Form<60, 160, @@ -2783,9 +2780,6 @@ def : Pat<(f32 (any_fround f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f32 (fnearbyint f32:$S)), - (f32 (COPY_TO_REGCLASS (XSRDPIC - (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; def : Pat<(f32 (any_ffloor f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIM (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; @@ -2804,6 +2798,19 @@ def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>; def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; +// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, +// these need to be defined after the any_frint versions so ISEL will correctly +// add the chain to the strict versions. +def : Pat<(f32 (fnearbyint f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIC + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f64 (fnearbyint f64:$S)), + (f64 (XSRDPIC $S))>; +def : Pat<(v2f64 (fnearbyint v2f64:$S)), + (v2f64 (XVRDPIC $S))>; +def : Pat<(v4f32 (fnearbyint v4f32:$S)), + (v4f32 (XVRSPIC $S))>; + // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), (v2i64 (XXLXORz))>; diff --git a/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll @@ -0,0 +1,127 @@ +; The non-strictfp version of test/CodeGen/PowerPC/respect-rounding-mode.ll +; Without strictfp, CSE should be free to eliminate the repeated multiply +; and conversion instructions. +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 
@llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) 
local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll @@ -0,0 +1,128 @@ +; The strictfp version of test/CodeGen/PowerPC/cse-despit-rounding-mode.ll +; With strictfp, the MachineIR optimizations need to assume that a call +; can change the rounding mode and must not move/eliminate the repeated +; multiply/convert instructions in this test. 
+; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) 
local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** 
@IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind strictfp } diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -4631,14 +4631,14 @@ define <4 x double> @constrained_vector_rint_v4f64(<4 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_rint_v4f64: ; PC64LE: # %bb.0: # %entry -; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: xvrdpic 35, 35 +; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_rint_v4f64: ; PC64LE9: # %bb.0: # %entry -; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: xvrdpic 35, 35 +; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: blr entry: %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
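
For reference, the situation the new RM-defining call opcodes protect is a strictfp function in which an opaque call separates two identical constrained rounding operations, as in the patch's test files above. The sketch below is illustrative only and is not part of the patch; the function and callee names are hypothetical. With the patch applied, the direct call should select one of the RM-defining call instructions (for example BL8_RM or BL8_NOP_RM, or BCTRL8_RM for an indirect call), so MachineIR passes treat the call as clobbering the rounding mode and keep both xvrdpic operations.

; Illustrative sketch only (hypothetical names, not taken from the patch).
; Both constrained rint calls must survive because @opaque may change the
; rounding mode between them.
declare void @opaque()
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)

define <2 x double> @keep_both_rints(<2 x double> %v) #0 {
entry:
  %a = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
  tail call void @opaque() #0
  %b = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
  ; combine one lane of each result so neither rint is dead
  %r = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %r
}

attributes #0 = { nounwind strictfp }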