Index: lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
===================================================================
--- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -323,10 +323,12 @@
   O << (unsigned int)Value;
 }
 
+// BUILD_VECTOR operands are signed, but this routine prints the operands of
+// XXSPLTIB, which are unsigned. So we simply truncate the immediate to 8 bits
+// and print it as an unsigned value.
 void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
-  unsigned int Value = MI->getOperand(OpNo).getImm();
-  assert(Value <= 255 && "Invalid u8imm argument!");
+  unsigned char Value = MI->getOperand(OpNo).getImm();
   O << (unsigned int)Value;
 }
 
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -672,6 +672,9 @@
       setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     }
+
+    if (Subtarget.isISA3_0() && Subtarget.hasDirectMove())
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Legal);
   }
 
   if (Subtarget.hasQPX()) {
@@ -7079,6 +7082,16 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+// Returns true when BVN has result type Type and every operand is the same
+// value as operand 0, i.e. the node splats one (possibly non-const) value.
+static bool isNonConstSplatBV(BuildVectorSDNode *BVN, EVT Type) {
+  const SDValue First = BVN->getOperand(0);
+  bool AllSame = BVN->getValueType(0) == Type;
+  for (int Idx = 1, End = BVN->getNumOperands(); AllSame && Idx < End; ++Idx)
+    AllSame = BVN->getOperand(Idx) == First;
+  return AllSame;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.  If we CAN select this case, and if it
 // selects to a single instruction, return Op.  Otherwise, if we can codegen
@@ -7200,8 +7213,17 @@
   bool HasAnyUndefs;
   if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
-      SplatBitSize > 32)
+      SplatBitSize > 32) {
+    // We can splat a non-const value on CPUs that implement ISA 3.0
+    // in two ways: LXVWSX (load and splat) and MTVSRWS (move and splat).
+    auto OpZero = BVN->getOperand(0);
+    bool CanLoadAndSplat = OpZero.getOpcode() == ISD::LOAD &&
+      BVN->isOnlyUserOf(OpZero.getNode());
+    if (Subtarget.isISA3_0() &&
+        isNonConstSplatBV(BVN, MVT::v4i32) && !CanLoadAndSplat)
+      return Op;
     return SDValue();
+  }
 
   unsigned SplatBits = APSplatBits.getZExtValue();
   unsigned SplatUndef = APSplatUndef.getZExtValue();
@@ -7219,6 +7241,10 @@
     return Op;
   }
 
+  // We have XXSPLTIB for constant splats that are one byte wide.
+  if (Subtarget.isISA3_0() && Op.getValueType() == MVT::v16i8)
+    return Op;
+
   // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
   int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
                     (32-SplatBitSize));
@@ -7462,6 +7488,18 @@
   if (Subtarget.hasVSX()) {
     if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
       int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+
+      // If the source for the shuffle is a scalar_to_vector that came from a
+      // 32-bit load, it will have used LXVWSX so we don't need to splat again.
+      if (Subtarget.isISA3_0() &&
+          ((isLittleEndian && SplatIdx == 3) ||
+           (!isLittleEndian && SplatIdx == 0))) {
+        SDValue Src = V1.getOperand(0);
+        if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+            Src.getOperand(0).getOpcode() == ISD::LOAD &&
+            Src.getOperand(0).hasOneUse())
+          return V1;
+      }
       SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
       SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
                                   DAG.getConstant(SplatIdx, dl, MVT::i32));
Index: lib/Target/PowerPC/PPCInstrFormats.td
===================================================================
--- lib/Target/PowerPC/PPCInstrFormats.td
+++ lib/Target/PowerPC/PPCInstrFormats.td
@@ -1059,6 +1059,13 @@
   let Inst{31}    = XT{5};
 }
 
+class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let XA = XT; // Both source register fields reuse the target field XT.
+  let XB = XT;
+}
+
 class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
                 InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
Index: lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.td
+++ lib/Target/PowerPC/PPCInstrInfo.td
@@ -312,6 +312,7 @@
   // field.  Used by instructions like 'ori'.
   return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
 }], LO16>;
+def immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; // signed 8-bit imm
 
 // imm16Shifted* - These match immediates where the low 16-bits are zero.  There
 // are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -761,6 +761,10 @@
                        "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
                        [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
   } // isCommutable
+  let isCodeGenOnly = 1 in // Zero idiom: XT = XT xor XT.
+  def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+                       "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+                       [(set v4i32:$XT, (v4i32 immAllZerosV))]>;
 
   // Permutation Instructions
   def XXMRGHW : XX3Form<60, 18,
@@ -1304,8 +1308,7 @@
 
 let Predicates = [IsISA3_0, HasDirectMove] in {
   def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
-                              "mtvsrws $XT, $rA", IIC_VecGeneral,
-                              []>;
+                              "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
 
   def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
                        "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
@@ -1869,6 +1872,10 @@
   dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
 }
 
+// Materialize an all-zero v2i64 vector with XXLXORz.
+def : Pat<(v2i64 immAllZerosV),
+          (v2i64 (XXLXORz))>;
+
 // The following VSX instructions were introduced in Power ISA 3.0
 def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
 let AddedComplexity = 400, Predicates = [HasP9Vector] in {
@@ -2282,4 +2289,41 @@
   def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
             (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
   } // IsLittleEndian, HasP9Vector
+
+  def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+            (v4i32 (LXVWSX xoaddr:$src))>; // scalar_to_vector of an i32 load
+  def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+            (v4f32 (LXVWSX xoaddr:$src))>; // scalar_to_vector of an f32 load
+  def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+            (v4i32 (MTVSRWS $A))>; // v4i32 splat of one i32 register value
+  def : Pat<(v16i8 (build_vector immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A)),
+            (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; // byte splat
+  def : Pat<(v16i8 immAllOnesV),
+            (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; // all-ones
+  def : Pat<(v8i16 immAllOnesV),
+            (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; // all-ones
+  def : Pat<(v4i32 immAllOnesV),
+            (v4i32 (XXSPLTIB 255))>; // all-ones, no register-class copy
+  def : Pat<(v2i64 immAllOnesV),
+            (v2i64 (XXSPLTIB 255))>; // all-ones, no register-class copy
 } // end HasP9Vector, AddedComplexity
+
+let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+          (v2i64 (MTVSRDD $rB, $rA))>; // LE: element 1 is the first operand
+def : Pat<(i64 (extractelt v2i64:$A, 0)),
+          (i64 (MFVSRLD $A))>; // LE: element 0 is read with MFVSRLD
+}
+
+let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+          (v2i64 (MTVSRDD $rB, $rA))>; // BE: element 0 is the first operand
+def : Pat<(i64 (extractelt v2i64:$A, 1)),
+          (i64 (MFVSRLD $A))>; // BE: element 1 is read with MFVSRLD
+}
+
Index: test/CodeGen/PowerPC/power9-moves-and-splats.ll
===================================================================
--- test/CodeGen/PowerPC/power9-moves-and-splats.ll
+++ test/CodeGen/PowerPC/power9-moves-and-splats.ll
@@ -0,0 +1,167 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-BE
+
+@Globi = external global i32, align 4
+@Globf = external global float, align 4
+
+define <2 x i64> @test1(i64 %a, i64 %b) { ; build a v2i64 from two i64 args
+entry:
+; CHECK-LABEL: test1
+; CHECK: mtvsrdd 34, 4, 3
+; CHECK-BE-LABEL: test1
+; CHECK-BE: mtvsrdd 34, 3, 4
+  %vecins = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecins1 = insertelement <2 x i64> %vecins, i64 %b, i32 1
+  ret <2 x i64> %vecins1
+}
+
+define i64 @test2(<2 x i64> %a) { ; element 0; only the little-endian run checks it
+entry:
+; CHECK-LABEL: test2
+; CHECK: mfvsrld 3, 34
+  %0 = extractelement <2 x i64> %a, i32 0
+  ret i64 %0
+}
+
+define i64 @test3(<2 x i64> %a) { ; element 1; only the big-endian run checks it
+entry:
+; CHECK-BE-LABEL: test3
+; CHECK-BE: mfvsrld 3, 34
+  %0 = extractelement <2 x i64> %a, i32 1
+  ret i64 %0
+}
+
+define <4 x i32> @test4(i32* nocapture readonly %in) { ; splat of a loaded i32
+entry:
+; CHECK-LABEL: test4
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test4
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load i32, i32* %in, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+}
+
+define <4 x float> @test5(float* nocapture readonly %in) { ; splat of a loaded float
+entry:
+; CHECK-LABEL: test5
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test5
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load float, float* %in, align 4
+  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %splat.splat
+}
+
+define <4 x i32> @test6() { ; splat of an i32 loaded from the global @Globi
+entry:
+; CHECK-LABEL: test6
+; CHECK: addis
+; CHECK: ld [[TOC:[0-9]+]], .LC0
+; CHECK: lxvwsx 34, 0, [[TOC]]
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test6
+; CHECK-BE: addis
+; CHECK-BE: ld [[TOC:[0-9]+]], .LC0
+; CHECK-BE: lxvwsx 34, 0, [[TOC]]
+; CHECK-BE-NOT: xxspltw
+  %0 = load i32, i32* @Globi, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+}
+
+define <4 x float> @test7() { ; splat of a float loaded from the global @Globf
+entry:
+; CHECK-LABEL: test7
+; CHECK: addis
+; CHECK: ld [[TOC:[0-9]+]], .LC1
+; CHECK: lxvwsx 34, 0, [[TOC]]
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test7
+; CHECK-BE: addis
+; CHECK-BE: ld [[TOC:[0-9]+]], .LC1
+; CHECK-BE: lxvwsx 34, 0, [[TOC]]
+; CHECK-BE-NOT: xxspltw
+  %0 = load float, float* @Globf, align 4
+  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %splat.splat
+}
+
+define <16 x i8> @test8() { ; all-zero v16i8 is expected to use xxlxor
+entry:
+; CHECK-LABEL: test8
+; CHECK: xxlxor 34, 34, 34
+; CHECK-BE-LABEL: test8
+; CHECK-BE: xxlxor 34, 34, 34
+  ret <16 x i8> zeroinitializer
+}
+
+define <16 x i8> @test9() { ; byte splat of 1 is expected to use xxspltib
+entry:
+; CHECK-LABEL: test9
+; CHECK: xxspltib 34, 1
+; CHECK-BE-LABEL: test9
+; CHECK-BE: xxspltib 34, 1
+  ret <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+}
+
+define <16 x i8> @test10() { ; byte splat of 127, the largest positive i8
+entry:
+; CHECK-LABEL: test10
+; CHECK: xxspltib 34, 127
+; CHECK-BE-LABEL: test10
+; CHECK-BE: xxspltib 34, 127
+  ret <16 x i8> <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>
+}
+
+define <16 x i8> @test11() { ; byte splat of -128 is expected to print as 128
+entry:
+; CHECK-LABEL: test11
+; CHECK: xxspltib 34, 128
+; CHECK-BE-LABEL: test11
+; CHECK-BE: xxspltib 34, 128
+  ret <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+}
+
+define <16 x i8> @test12() { ; byte splat of -1 is expected to print as 255
+entry:
+; CHECK-LABEL: test12
+; CHECK: xxspltib 34, 255
+; CHECK-BE-LABEL: test12
+; CHECK-BE: xxspltib 34, 255
+  ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+}
+
+define <16 x i8> @test13() { ; byte splat of -127 is expected to print as 129
+entry:
+; CHECK-LABEL: test13
+; CHECK: xxspltib 34, 129
+; CHECK-BE-LABEL: test13
+; CHECK-BE: xxspltib 34, 129
+  ret <16 x i8> <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+}
+
+define <4 x i32> @test14(<4 x i32> %a, i32* nocapture readonly %b) { ; loaded value is also used by the add below
+entry:
+; CHECK-LABEL: test14
+; CHECK: lwz [[LD:[0-9]+]],
+; CHECK: mtvsrws 34, [[LD]]
+; CHECK-BE-LABEL: test14
+; CHECK-BE: lwz [[LD:[0-9]+]],
+; CHECK-BE: mtvsrws 34, [[LD]]
+  %0 = load i32, i32* %b, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = add i32 %0, 5
+  store i32 %1, i32* %b, align 4
+  ret <4 x i32> %splat.splat
+}