Index: lib/Target/PowerPC/PPCMIPeephole.cpp
===================================================================
--- lib/Target/PowerPC/PPCMIPeephole.cpp
+++ lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -400,9 +400,9 @@
       case PPC::VSPLTH:
       case PPC::XXSPLTW: {
         unsigned MyOpcode = MI.getOpcode();
-        unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
+        unsigned RegOpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
         unsigned TrueReg =
-          TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
+          TII->lookThruCopyLike(MI.getOperand(RegOpNo).getReg(), MRI);
         if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
           break;
         MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -433,32 +433,90 @@
           DEBUG(MI.dump());
           BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
                   MI.getOperand(0).getReg())
-              .add(MI.getOperand(OpNo));
+              .add(MI.getOperand(RegOpNo));
           ToErase = &MI;
           Simplified = true;
         }
-        // Splat fed by a shift. Usually when we align value to splat into
-        // vector element zero.
-        if (DefOpcode == PPC::XXSLDWI) {
-          unsigned ShiftRes = DefMI->getOperand(0).getReg();
-          unsigned ShiftOp1 = DefMI->getOperand(1).getReg();
-          unsigned ShiftOp2 = DefMI->getOperand(2).getReg();
-          unsigned ShiftImm = DefMI->getOperand(3).getImm();
-          unsigned SplatImm = MI.getOperand(2).getImm();
-          if (ShiftOp1 == ShiftOp2) {
-            unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
-            if (MRI->hasOneNonDBGUse(ShiftRes)) {
-              DEBUG(dbgs() << "Removing redundant shift: ");
-              DEBUG(DefMI->dump());
-              ToErase = DefMI;
-            }
-            Simplified = true;
-            DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
-                  " to " << NewElem << " in instruction: ");
+
+        // Get the op number for the immediate and then check to make sure
+        //  that it is an immediate.
+        unsigned ImmOpNo = MyOpcode == PPC::XXSPLTW ? 2 : 1;
+        assert(MI.getOperand(ImmOpNo).isImm() &&
+               "Operand should be an immediate.");
+
+        // Deal with the situation where a splat is fed by either a shift
+        //  or a swap.
+        // Splat is fed by a SHIFT of the form
+        //  XXSLDWI %VA, %VA, imm
+        // Splat is fed by a SWAP which is a permute of this form
+        //  XXPERMDI %VA, %VA, 2
+        // Since the splat instruction can use any of the vector elements to do
+        //  the splat we do not have to rearrange the elements in the vector
+        //  with a swap or shift before we do the splat. We can simply do the
+        //  splat from a different index.
+        // If the swap or shift has only one use (the splat) then we can
+        //  completely remove it.
+        if (DefOpcode == PPC::XXSLDWI || DefOpcode == PPC::XXPERMDI) {
+          unsigned DefRes = DefMI->getOperand(0).getReg();
+          unsigned DefOp1 = DefMI->getOperand(1).getReg();
+          unsigned DefOp2 = DefMI->getOperand(2).getReg();
+
+          // Note that for the forms handled here, the immediate equals the
+          // number of words the input vector is rotated left.
+          // For example: xxsldwi x, y, y, 2 == xxpermdi x, y, y, 2.
+          unsigned DefImm = DefMI->getOperand(3).getImm();
+
+          unsigned SplatImm = MI.getOperand(ImmOpNo).getImm();
+
+          // If the two register operands differ, this isn't a
+          // shift/swap but a permutation of a pair of concatenated
+          // registers.
+          if (DefOp1 != DefOp2)
+            break;
+
+          // The permute is not a swap so there is nothing we can do.
+          if (DefOpcode == PPC::XXPERMDI && DefImm != 2)
+            break;
+
+          unsigned NewElem = 0;
+          // Compute the new index to use for the splat.
+          if (MI.getOpcode() == PPC::VSPLTB)
+            NewElem = (SplatImm + DefImm*4) & 0xF;
+          else if (MI.getOpcode() == PPC::VSPLTH)
+            NewElem = (SplatImm + DefImm*2) & 0x7;
+          else if (MI.getOpcode() == PPC::XXSPLTW)
+            NewElem = (SplatImm + DefImm) & 0x3;
+          else {
+            DEBUG(dbgs() << "Unknown splat opcode.");
             DEBUG(MI.dump());
-            MI.getOperand(1).setReg(ShiftOp1);
-            MI.getOperand(2).setImm(NewElem);
+            break;
+          }
+
+          if (MRI->hasOneNonDBGUse(DefRes)) {
+            DEBUG(dbgs() << "Removing redundant instruction: ");
+            DEBUG(DefMI->dump());
+            ToErase = DefMI;
+          }
+          Simplified = true;
+          DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                " to " << NewElem << " in instruction: ");
+          DEBUG(MI.dump());
+
+          // Depending on the type of splat we may need a VSR or a VR register.
+          unsigned TmpReg = 0;
+          if (MI.getOpcode() == PPC::VSPLTB || MI.getOpcode() == PPC::VSPLTH) {
+            // Need a VR register so copy into one.
+            TmpReg = MF->getRegInfo().createVirtualRegister(&PPC::VRRCRegClass);
+            BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), TmpReg)
+              .add(DefMI->getOperand(1));
+          } else {
+            // MI.getOpcode() == PPC::XXSPLTW
+            // Need a VSR register so just use the existing one.
+            TmpReg = DefOp1;
           }
+
+          MI.getOperand(RegOpNo).setReg(TmpReg);
+          MI.getOperand(ImmOpNo).setImm(NewElem);
         }
         break;
       }
Index: test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
===================================================================
--- test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -16,7 +16,7 @@
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 56
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -28,7 +28,7 @@
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 48
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
 }
 
 ; Function Attrs: norecurse nounwind readnone
Index: test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/ppc-xxsldwi-peephole.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @test(<16 x i8> %a, i8 zeroext %b) local_unnamed_addr {
+entry:
+  %0 = bitcast <16 x i8> %a to <4 x i32>
+  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+  %2 = bitcast <4 x i32> %1 to <16 x i8>
+  %add = add <16 x i8> %2, %a
+  %splat.splat = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %add1 = add <16 x i8> %add, %splat.splat
+  ret <16 x i8> %add1
+; CHECK-LABEL: test
+; CHECK: vspltb
+; CHECK: blr
+}
+
Index: test/CodeGen/PowerPC/ppc64-peephole-swap.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/ppc64-peephole-swap.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s --implicit-check-not xxswapd
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 --implicit-check-not xxswapd
+
+; The straightforward expansion of this code will result in a swap followed by a
+;  splat. However, the swap is not needed since in this case the splat is the
+;  only use.
+; We want to check that we are not using the swap and that we have indexed the
+;  splat to the correct location.
+; 8 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_8_plus(<16 x i8> %v, i8 signext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_8_plus
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_8_plus
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 8 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_u8_plus(<16 x i8> %v, i8 zeroext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_u8_plus
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u8_plus
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_16_plus(<8 x i16> %v, i16 signext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_16_plus
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_16_plus
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_u16_plus(<8 x i16> %v, i16 zeroext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_u16_plus
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u16_plus
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Signed Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_32_plus(<4 x i32> %v, i32 signext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_32_plus
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_32_plus
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Unsigned Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_u32_plus(<4 x i32> %v, i32 zeroext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_u32_plus
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u32_plus
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+
Index: test/CodeGen/PowerPC/vsx.ll
===================================================================
--- test/CodeGen/PowerPC/vsx.ll
+++ test/CodeGen/PowerPC/vsx.ll
@@ -1163,10 +1163,9 @@
 
 ; CHECK-LE-LABEL: @test80
 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
-; CHECK-LE-DAG: xxswapd  [[V1:[0-9]+]], [[R1]]
 ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
 ; CHECK-LE-DAG: lvx 3, 0, [[R2]]
-; CHECK-LE-DAG: xxspltw 34, [[V1]]
+; CHECK-LE-DAG: xxspltw 34, {{[0-9]+}}
 ; CHECK-LE-NOT: xxswapd 35, [[V2]]
 ; CHECK-LE: vadduwm 2, 2, 3
 ; CHECK-LE: blr