diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -25,6 +25,8 @@
 bool isVVPBinaryOp(unsigned Opcode);
 
+bool isVVPTernaryOp(unsigned Opcode);
+
 MVT splitVectorType(MVT VT);
 
 bool isPackedVectorType(EVT SomeVT);
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -138,6 +138,16 @@
   return false;
 }
 
+bool isVVPTernaryOp(unsigned VVPOpcode) {
+  switch (VVPOpcode) {
+#define ADD_TERNARY_VVP_OP(VVPNAME, ...)                                       \
+  case VEISD::VVPNAME:                                                         \
+    return true;
+#include "VVPNodes.def"
+  }
+  return false;
+}
+
 // Return the AVL operand position for this VVP or VEC Op.
 Optional<int> getAVLPos(unsigned Opc) {
   // This is only available for VP SDNodes
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -81,19 +81,30 @@
     return CDAG.getNode(VVPOpcode, LegalVecVT,
                         {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
   }
-  if (VVPOpcode == VEISD::VVP_SELECT) {
+  switch (VVPOpcode) {
+  default:
+    llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  case VEISD::VVP_FFMA: {
+    // VE has a swizzled operand order in FMA (compared to LLVM IR and
+    // SDNodes).
+    auto X = Op->getOperand(2);
+    auto Y = Op->getOperand(0);
+    auto Z = Op->getOperand(1);
+    return CDAG.getNode(VVPOpcode, LegalVecVT, {X, Y, Z, Mask, AVL});
+  }
+  case VEISD::VVP_SELECT: {
     auto Mask = Op->getOperand(0);
     auto OnTrue = Op->getOperand(1);
     auto OnFalse = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
   }
-  if (VVPOpcode == VEISD::VVP_SETCC) {
+  case VEISD::VVP_SETCC: {
     auto LHS = Op->getOperand(0);
     auto RHS = Op->getOperand(1);
     auto Pred = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
   }
-  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  }
 }
 
 SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -57,6 +57,17 @@
   IsVLVT<4>
 ]>;
 
+// TernaryFPOp(x,y,z,mask,vl)
+def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [
+  SDTCisSameAs<0, 1>,
+  SDTCisSameAs<0, 2>,
+  SDTCisSameAs<0, 3>,
+  SDTCisFP<0>,
+  SDTCisInt<4>,
+  SDTCisSameNumEltsAs<0, 4>,
+  IsVLVT<5>
+]>;
+
 // Select(OnTrue, OnFalse, SelMask, vl)
 def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge
   SDTCisVec<0>,
@@ -86,6 +97,12 @@
   [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
    (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
 
+class vvp_fma_commutative<SDNode RootOp> :
+  PatFrags<
+    (ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+    [(RootOp node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+     (RootOp node:$X, node:$Z, node:$Y, node:$mask, node:$vlen)]>;
+
 // VVP node definitions.
 def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>;
 def c_vvp_add : vvp_commutative<vvp_add>;
@@ -118,6 +135,9 @@
 def c_vvp_fmul : vvp_commutative<vvp_fmul>;
 def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
 
+def vvp_ffma : SDNode<"VEISD::VVP_FFMA", SDTFPTernaryOpVVP>;
+def c_vvp_ffma : vvp_fma_commutative<vvp_ffma>;
+
 // } Binary Operators
 
 def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP,
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -341,6 +341,122 @@ defm : Binary_rv_vv;
 
+multiclass Ternary_vvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  ///// Mask + vvp_select /////
+  // with mask
+  def : Pat<(vvp_select
+                (OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml_v")
+                $vx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  ///// w/o Mask /////
+  // w/o mask
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvl")
+                $vx, $vy, $vz, $avl)>;
+
+  ///// Mask /////
+  // with mask
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml")
+                $vx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast first.
+  def : Pat<(vvp_select
+                (OpNode
+                    (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml_v")
+                $sx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast first.
+  def : Pat<(OpNode
+                (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvl")
+                $sx, $vy, $vz, $avl)>;
+
+  // Masked, broadcast first.
+  def : Pat<(OpNode
+                (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml")
+                $sx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_vrv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast second.
+  def : Pat<(vvp_select
+                (OpNode
+                    DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                    (MaskVT srcvalue), (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml_v")
+                $vx, $sy, $vz,
+                $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast second.
+  def : Pat<(OpNode
+                DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvl")
+                $vx, $sy, $vz, $avl)>;
+
+  // Masked, broadcast second.
+  def : Pat<(OpNode
+                DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml")
+                $vx, $sy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv_vrv_vvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  defm : Ternary_rvv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vrv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vvv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+}
+
+// Expand both the 64-bit and the 32-bit (256-element) variants.
+multiclass Ternary_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+  defm : Ternary_rvv_vrv_vvv<OpNode, LongScalarVT, LongDataVT,
+                             v256i1, LongOpBaseName>;
+  defm : Ternary_rvv_vrv_vvv<OpNode, ShortScalarVT, ShortDataVT,
+                             v256i1, ShortOpBaseName>;
+}
+
+defm : Ternary_ShortLong<c_vvp_ffma,
+                         f64, v256f64, "VFMADD",
+                         f32, v256f32, "VFMADS">;
+defm : Ternary_rvv_vrv_vvv<c_vvp_ffma,
+                           i64, v512f32, v512i1, "PVFMAD">;
+
 multiclass Merge_mvv<
     SDPatternOperator OpNode,
     ValueType DataVT, ValueType MaskVT,
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -33,6 +33,14 @@
 HANDLE_VP_TO_VVP(VPNAME, VVPNAME)
 #endif
 
+/// ADD_TERNARY_VVP_OP(VVPNAME,SDNAME)
+/// \p VVPNAME is a VVP ternary operator.
+/// \p SDNAME is the generic SD opcode corresponding to \p VVPNAME.
+#ifndef ADD_TERNARY_VVP_OP
+#define ADD_TERNARY_VVP_OP(VVPNAME,SDNAME) \
+  ADD_VVP_OP(VVPNAME,SDNAME)
+#endif
+
 #ifndef ADD_BINARY_VVP_OP_COMPACT
 #define ADD_BINARY_VVP_OP_COMPACT(NAME) \
   ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME)
@@ -68,6 +76,8 @@
 ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL)
 ADD_BINARY_VVP_OP_COMPACT(FDIV)
 
+ADD_TERNARY_VVP_OP(VVP_FFMA,FMA) HANDLE_VP_TO_VVP(VP_FMA, VVP_FFMA) REGISTER_PACKED(VVP_FFMA)
+
 ADD_VVP_OP(VVP_SETCC, SETCC)
 
 // Shuffles.
@@ -76,6 +86,7 @@
 HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
 
 #undef ADD_BINARY_VVP_OP
+#undef ADD_TERNARY_VVP_OP
 #undef ADD_BINARY_VVP_OP_COMPACT
 #undef ADD_VVP_OP
 #undef HANDLE_VP_TO_VVP
diff --git a/llvm/test/CodeGen/VE/Vector/vec_fma.ll b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.fma.v256f32(<256 x float>, <256 x float>, <256 x float>)
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.fma.v256f64(<256 x double>, <256 x double>, <256 x double>)
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fma.ll b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vp.fma.v256f64(<256 x double>, <256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv_merge(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v3, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v3
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv_merge(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv_merge(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr_merge(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}