Index: llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -435,28 +435,178 @@
         ToErase = &MI;
         Simplified = true;
       }
-      // Splat fed by a shift. Usually when we align value to splat into
-      // vector element zero.
-      if (DefOpcode == PPC::XXSLDWI) {
+
+      // Word splats fed by permutations/shifts. The following cases apply:
+      // 1) Simplify the following sequence: load->xxpermdi->splat and emit
+      //    a splatting load (LXVWSX). This is performed on POWER 9.
+      // 2) Simplify a splat fed by a permute (eg. xxpermdi->splat) and
+      //    remove the permute because it is redundant.
+      bool LoadSplatted = false;
+
+      // Types of permutes or shifts allowed to immediately feed a splat.
+      bool isSupportedPermuteOp = (DefOpcode == PPC::XXPERMDIs) ||
+          (DefOpcode == PPC::XXSLDWIs) || (DefOpcode == PPC::XXPERMDI) ||
+          (DefOpcode == PPC::XXSLDWI);
+
+      // Types of loads allowed to precede a splat, in the situation of
+      // emitting a splatting load (ld->splat or ld->perm->splat => LXVWSX).
+      auto isSupportedLoadOp = [=](unsigned opCode) -> bool {
+        return (opCode == PPC::LIWZX || opCode == PPC::LIWAX ||
+                opCode == PPC::LXSDX || opCode == PPC::LXSIBZX ||
+                opCode == PPC::LXSIHZX || opCode == PPC::LXSSPX ||
+                opCode == PPC::LXVX || opCode == PPC::LXVB16X ||
+                opCode == PPC::LXVD2X || opCode == PPC::LXVDSX ||
+                opCode == PPC::LXVH8X || opCode == PPC::LXVL ||
+                opCode == PPC::LXVLL || opCode == PPC::LXVW4X);
+      };
+
+      // Detect a load -> permute -> splat sequence.
+      auto isLoadPermAndSplat = [=]() -> bool {
+        if (MF->getSubtarget().hasP9Vector() &&
+            MyOpcode == PPC::XXSPLTW && isSupportedPermuteOp) {
+          // Retrieve the instruction that precedes the splat (represented as
+          // DefMI) and ensure that it is a valid (permute) instruction.
+          unsigned PermuteReg =
+              TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+
+          if (TargetRegisterInfo::isVirtualRegister(PermuteReg)) {
+            // The instruction that precedes the splat should be a load.
+            MachineInstr *LoadMI = MRI->getVRegDef(PermuteReg);
+            if (LoadMI && isSupportedLoadOp(LoadMI->getOpcode()))
+              return true;
+          }
+        }
+        return false;
+      };
+
+      // Detect a load -> splat sequence.
+      bool isLoadAndSplat = (MF->getSubtarget().hasP9Vector()
+                             && MyOpcode == PPC::XXSPLTW
+                             && isSupportedLoadOp(DefOpcode));
+
+      // CASE 1: (P9 ONLY) Emit a splatting load in the following sequence:
+      // load->swap/shift/any permutation->splat, or load->splat.
+      // This case covers when the permutations are of XXPERMDI, XXPERMDIs,
+      // as well as shifts (XXSLDWIs, XXSLDWI).
+      if (isLoadPermAndSplat()) {
+
+        // Retrieve the load (LoadMI) that precedes the splat instruction.
+        // Checking if the load is valid is handled in isLoadPermAndSplat().
+        unsigned SwapReg =
+            TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
+        MachineInstr *LoadMI = MRI->getVRegDef(SwapReg);
+
+        // Get the results of the permute and the load instruction to
+        // check if they are of single use.
+        unsigned PermRes = DefMI->getOperand(0).getReg();
+        unsigned LoadRes = LoadMI->getOperand(0).getReg();
+
+        // If we have: load->perm->splat, emit a splatting load instead.
+        // Change the load into a splatting load, if it has a single use.
+        if (MRI->hasOneNonDBGUse(LoadRes)) {
+          LLVM_DEBUG(dbgs() << "Optimizing ld/swp/splat => splatting load. ");
+          LLVM_DEBUG(LoadMI->dump());
+
+          LoadMI->setDesc(TII->get(PPC::LXVWSX));
+          MRI->setRegClass(LoadMI->getOperand(0).getReg(), &PPC::VSRCRegClass);
+
+          // Change the splat into a copy, and ensure the result of the
+          // splatting load gets forwarded into the users of the splat.
+ MI.setDesc(TII->get(PPC::COPY)); + MI.getOperand(1).setReg(LoadMI->getOperand(0).getReg()); + MI.RemoveOperand(2); + } + + // Remove the permute operation (DefMI) if it has a single use. + if (MRI->hasOneNonDBGUse(PermRes)) { + LLVM_DEBUG(dbgs() << "Removing redundant permute: "); + LLVM_DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + LoadSplatted = true; + } + else if (isLoadAndSplat) { + // Emit a splatting load if the load (DefMI) has a single use. + unsigned LoadRes = DefMI->getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(LoadRes)) { + LLVM_DEBUG(dbgs() << "Optimizing load/splat => splatting load"); + LLVM_DEBUG(DefMI->dump()); + DefMI->setDesc(TII->get(PPC::LXVWSX)); + MRI->setRegClass(DefMI->getOperand(0).getReg(), &PPC::VSRCRegClass); + + // Change the splat (MI) to COPY to forward the splatting load's + // result to the users of the splat. + MI.setDesc(TII->get(PPC::COPY)); + MI.getOperand(1).setReg(DefMI->getOperand(0).getReg()); + MI.RemoveOperand(2); + } + Simplified = true; + LoadSplatted = true; + } + + // CASE 2: (P8, P9) Check for a splat fed by a shift, or permute; + // usually when we align value to splat into vector element zero. + // The splat can select any of the vector elements to perform the splat, + // so a shift/permute of the vector elements prior to splatting is + // unnecessary. Keep track of where the element was prior to the splat + // and splat that element instead. + if (!LoadSplatted && MyOpcode == PPC::XXSPLTW && isSupportedPermuteOp) { + // Save the necessary operands of the permute/shift and splat to be + // later used to keep track of where the element was before. 
unsigned ShiftRes = DefMI->getOperand(0).getReg(); unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); - unsigned ShiftImm = DefMI->getOperand(3).getImm(); unsigned SplatImm = MI.getOperand(2).getImm(); - if (ShiftOp1 == ShiftOp2) { - unsigned NewElem = (SplatImm + ShiftImm) & 0x3; - if (MRI->hasOneNonDBGUse(ShiftRes)) { - LLVM_DEBUG(dbgs() << "Removing redundant shift: "); - LLVM_DEBUG(DefMI->dump()); - ToErase = DefMI; - } - Simplified = true; - LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm - << " to " << NewElem << " in instruction: "); - LLVM_DEBUG(MI.dump()); - MI.getOperand(1).setReg(ShiftOp1); - MI.getOperand(2).setImm(NewElem); + unsigned ShiftOp2; + unsigned ShiftImm; + + // Depending on if we have a double loading operand version of + // the permute/shift, save that operand as well, and check if they + // are equal before proceeding with the transformation. + if (DefOpcode == PPC::XXPERMDI || DefOpcode == PPC::XXSLDWI) { + ShiftOp2 = DefMI->getOperand(2).getReg(); + ShiftImm = DefMI->getOperand(3).getImm(); + // The transformation is not performed if the operands differ. + if (ShiftOp1 != ShiftOp2) + break; + } else if (DefOpcode == PPC::XXPERMDIs || DefOpcode == PPC::XXSLDWIs) { + ShiftImm = DefMI->getOperand(2).getImm(); } + + // Find the new element index for the splat instruction to use. + unsigned NewElem = (SplatImm + ShiftImm) & 0x3; + if (MRI->hasOneNonDBGUse(ShiftRes)) { + LLVM_DEBUG(dbgs() << "Removing redundant shift/permute: "); + LLVM_DEBUG(DefMI->dump()); + ToErase = DefMI; + } + Simplified = true; + LLVM_DEBUG(dbgs() << "Changing splat immediate from " << SplatImm + << " to " << NewElem << " in instruction: "); + LLVM_DEBUG(MI.dump()); + + // Depending on the VSX register class of the shift/permute's input + // operand, ensure that it matches the class of the splat (VSRC). 
+ unsigned NewReg = + MF->getRegInfo().createVirtualRegister(&PPC::VSRCRegClass); + + if (MRI->getRegClass(ShiftOp1) == &PPC::VSSRCRegClass || + MRI->getRegClass(ShiftOp1) == &PPC::VSFRCRegClass) { + // Use SUBREG_TO_REG to insert the input value into the proper + // register subclass when the input is a VSSRC or VSFRC. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::SUBREG_TO_REG), + NewReg) + .addImm(1) + .add(DefMI->getOperand(1)) + .addImm(PPC::sub_64); + } else { + // Copy the input into a new VSX register of the correct subclass. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), NewReg) + .add(DefMI->getOperand(1)); + } + + MI.getOperand(1).setReg(NewReg); + MI.getOperand(2).setImm(NewElem); } break; } Index: llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll =================================================================== --- llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll +++ llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll @@ -13,8 +13,7 @@ ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8: lfiwzx f0, 0, r3 ; CHECK-P8: ld r4, .LC0@toc@l(r4) -; CHECK-P8: xxpermdi vs0, f0, f0, 2 -; CHECK-P8: xxspltw v2, vs0, 3 +; CHECK-P8: xxspltw v2, vs0, 1 ; CHECK-P8: stvx v2, 0, r4 ; CHECK-P8: lis r4, 1024 ; CHECK-P8: lfiwax f0, 0, r3 @@ -27,11 +26,9 @@ ; ; CHECK-P9-LABEL: testExpandPostRAPseudo: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9: lfiwzx f0, 0, r3 ; CHECK-P9: addis r4, r2, .LC0@toc@ha ; CHECK-P9: ld r4, .LC0@toc@l(r4) -; CHECK-P9: xxpermdi vs0, f0, f0, 2 -; CHECK-P9: xxspltw vs0, vs0, 3 +; CHECK-P9: lxvwsx vs0, 0, r3 ; CHECK-P9: stxvx vs0, 0, r4 ; CHECK-P9: lis r4, 1024 ; CHECK-P9: lfiwax f0, 0, r3 Index: llvm/test/CodeGen/PowerPC/build-vector-tests.ll =================================================================== --- llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -109,8 +109,8 @@ ;vector int spltRegVali(int val) { // ; return (vector int) val; // ;} // -;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): 
lfiwzx, xxsldwi, xxspltw // -;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P8: (LE) lfiwzx, xxspltw (BE): lfiwzx, xxspltw // +;// P9: (LE) lxvwsx (BE): lxvwsx // ;vector int spltMemVali(int *ptr) { // ; return (vector int)*ptr; // ;} // @@ -284,8 +284,8 @@ ;vector unsigned int spltRegValui(unsigned int val) { // ; return (vector unsigned int) val; // ;} // -;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // -;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P8: (LE) lfiwzx, xxspltw (BE): lfiwzx, xxspltw // +;// P9: (LE) lxvwsx (BE): lxvwsx // ;vector unsigned int spltMemValui(unsigned int *ptr) { // ; return (vector unsigned int)*ptr; // ;} // @@ -1202,21 +1202,15 @@ ; P9LE-LABEL: spltMemVali ; P8BE-LABEL: spltMemVali ; P8LE-LABEL: spltMemVali -; P9BE: lfiwzx f0, 0, r3 -; P9BE: xxsldwi vs0, f0, f0, 1 -; P9BE: xxspltw v2, vs0, 0 +; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr -; P9LE: lfiwzx f0, 0, r3 -; P9LE: xxpermdi vs0, f0, f0, 2 -; P9LE: xxspltw v2, vs0, 3 +; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lfiwzx f0, 0, r3 -; P8BE: xxsldwi vs0, f0, f0, 1 -; P8BE: xxspltw v2, vs0, 0 +; P8BE: xxspltw v2, vs0, 1 ; P8BE: blr ; P8LE: lfiwzx f0, 0, r3 -; P8LE: xxpermdi vs0, f0, f0, 2 -; P8LE: xxspltw v2, vs0, 3 +; P8LE: xxspltw v2, vs0, 1 ; P8LE: blr } @@ -2344,21 +2338,15 @@ ; P9LE-LABEL: spltMemValui ; P8BE-LABEL: spltMemValui ; P8LE-LABEL: spltMemValui -; P9BE: lfiwzx f0, 0, r3 -; P9BE: xxsldwi vs0, f0, f0, 1 -; P9BE: xxspltw v2, vs0, 0 +; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr -; P9LE: lfiwzx f0, 0, r3 -; P9LE: xxpermdi vs0, f0, f0, 2 -; P9LE: xxspltw v2, vs0, 3 +; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lfiwzx f0, 0, r3 -; P8BE: xxsldwi vs0, f0, f0, 1 -; P8BE: xxspltw v2, vs0, 0 +; P8BE: xxspltw v2, vs0, 1 ; P8BE: blr ; P8LE: lfiwzx f0, 0, r3 -; P8LE: xxpermdi vs0, f0, f0, 2 -; P8LE: xxspltw v2, vs0, 3 +; P8LE: xxspltw v2, vs0, 1 ; P8LE: blr } Index: llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll 
=================================================================== --- llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll +++ llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll @@ -1,23 +1,30 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-LE \ -; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck \ -; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck -implicit-check-not vmrg \ +; RUN: -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck -implicit-check-not vmrg \ +; RUN: -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr9 \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-P9 \ +; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s +; RUN: llc -verify-machineinstrs -mcpu=pwr9 \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-P9 \ +; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s define <16 x i8> @test(i32* %s, i32* %t) { -; CHECK-LE-LABEL: test: -; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-LE-NEXT: xxspltw v2, vs0, 3 -; CHECK-LE-NEXT: blr +; CHECK-P9-LABEL: test: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr ; CHECK-LABEL: test: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-NEXT: xxspltw v2, vs0, 0 +; CHECK-NEXT: xxspltw v2, vs0, 1 ; CHECK-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* Index: llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll =================================================================== --- llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll +++ llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll @@ -57,16 +57,12 @@ define <4 x i32> @test4(i32* nocapture readonly %in) { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test4: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load i32, i32* %in, align 4 @@ -78,16 +74,12 @@ define <4 x float> @test5(float* nocapture readonly %in) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test5: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load float, float* %in, align 4 @@ -101,18 +93,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LC0@toc@ha ; CHECK-NEXT: ld r3, .LC0@toc@l(r3) -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test6: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LC0@toc@ha ; CHECK-BE-NEXT: ld r3, .LC0@toc@l(r3) -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; 
CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load i32, i32* @Globi, align 4 @@ -126,18 +114,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LC1@toc@ha ; CHECK-NEXT: ld r3, .LC1@toc@l(r3) -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: lxvwsx v2, 0, r3 ; CHECK-NEXT: blr ; CHECK-BE-LABEL: test7: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LC1@toc@ha ; CHECK-BE-NEXT: ld r3, .LC1@toc@l(r3) -; CHECK-BE-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: lxvwsx v2, 0, r3 ; CHECK-BE-NEXT: blr entry: %0 = load float, float* @Globf, align 4 Index: llvm/test/CodeGen/PowerPC/qpx-load-splat.ll =================================================================== --- llvm/test/CodeGen/PowerPC/qpx-load-splat.ll +++ llvm/test/CodeGen/PowerPC/qpx-load-splat.ll @@ -1,13 +1,25 @@ ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefix=CHECK-P9 %s +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefix=CHECK-P9 %s ; Function Attrs: norecurse nounwind readonly define <4 x double> @foo(double* nocapture readonly %a) #0 { ; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvdsx v2, 0, r3 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: lxvdsx v2, 0, r3 +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: foo: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: lxvdsx v2, 0, r3 +; CHECK-P9: vmr v3, v2 
+; CHECK-P9: blr entry: %0 = load double, double* %a, align 8 %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 @@ -17,11 +29,18 @@ define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { ; CHECK-LABEL: foox: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: sldi r4, r4, 3 +; CHECK: lxvdsx v2, r3, r4 +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: foox: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 3 +; CHECK-P9: lxvdsx v2, r3, r4 +; CHECK-P9: vmr v3, v2 +; CHECK-P9: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 @@ -32,13 +51,22 @@ define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { ; CHECK-LABEL: fooxu: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lfdux f0, r3, r4 -; CHECK-NEXT: xxspltd v2, vs0, 0 -; CHECK-NEXT: std r3, 0(r5) -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK: sldi r4, r4, 3 +; CHECK: lfdux f0, r3, r4 +; CHECK: xxspltd v2, vs0, 0 +; CHECK: std r3, 0(r5) +; CHECK: vmr v3, v2 +; CHECK: blr + +; CHECK-P9-LABEL: fooxu: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 3 +; CHECK-P9: lfdux f0, r3, r4 +; CHECK-P9: std r3, 0(r5) +; CHECK-P9: xxspltd v2, vs0, 0 +; CHECK-P9: vmr v3, v2 +; CHECK-P9: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 @@ -51,10 +79,14 @@ define <4 x float> @foof(float* nocapture readonly %a) #0 { ; CHECK-LABEL: foof: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 -; CHECK-NEXT: blr +; CHECK: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK: blr + +; CHECK-P9-LABEL: foof: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: lxvwsx v2, 0, r3 +; CHECK-P9: blr entry: %0 = load 
float, float* %a, align 4 %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 @@ -65,11 +97,16 @@ define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { ; CHECK-LABEL: foofx: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 2 -; CHECK-NEXT: lfiwzx f0, r3, r4 -; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 -; CHECK-NEXT: xxspltw v2, vs0, 3 -; CHECK-NEXT: blr +; CHECK: sldi r4, r4, 2 +; CHECK: lfiwzx f0, r3, r4 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK: blr + +; CHECK-P9-LABEL: foofx: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: sldi r4, r4, 2 +; CHECK-P9: lxvwsx v2, r3, r4 +; CHECK-P9: blr entry: %p = getelementptr float, float* %a, i64 %idx %0 = load float, float* %p, align 4 Index: llvm/test/CodeGen/PowerPC/vsx.ll =================================================================== --- llvm/test/CodeGen/PowerPC/vsx.ll +++ llvm/test/CodeGen/PowerPC/vsx.ll @@ -1173,10 +1173,10 @@ ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd f0, r3 -; CHECK-LE-DAG: xxswapd vs0, vs0 +; CHECK-LE-DAG: addis r4, r2, .LCPI65_0@toc@ha ; CHECK-LE-DAG: addi r3, r4, .LCPI65_0@toc@l ; CHECK-LE-DAG: lvx v3, 0, r3 -; CHECK-LE-DAG: xxspltw v2, vs0, 3 +; CHECK-LE-DAG: xxspltw v2, vs0, 1 ; CHECK-LE-NOT: xxswapd v3, ; CHECK-LE: vadduwm v2, v2, v3 ; CHECK-LE: blr