diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -738,6 +738,8 @@ MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true) MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true) MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true) +MMA_BUILTIN(lxvp, "W256SLLiW256C*", false) +MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false) // FIXME: Obviously incomplete. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14776,6 +14776,19 @@ break; #include "clang/Basic/BuiltinsPPC.def" } + if (BuiltinID == PPC::BI__builtin_mma_lxvp || + BuiltinID == PPC::BI__builtin_mma_stxvp) { + if (BuiltinID == PPC::BI__builtin_mma_lxvp) { + Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy); + Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]); + } else { + Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy); + Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]); + } + Ops.pop_back(); + llvm::Function *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Ops, ""); + } SmallVector<Value *, 4> CallOps; if (Accumulate) { Address Addr = EmitPointerWithAlignment(E->getArg(0)); diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c --- a/clang/test/CodeGen/builtins-ppc-mma.c +++ b/clang/test/CodeGen/builtins-ppc-mma.c @@ -1036,3 +1036,162 @@ __builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0); *((__vector_quad *)resp) = vq; } + +// CHECK-LABEL: @test66( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]]) +// CHECK-NEXT: ret void +// +void test66(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_stxvp(vp, 0LL, vp2); +} + +// CHECK-LABEL: @test67( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[OFFSET]] +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test67(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(offset, vpp); + __builtin_mma_stxvp(vp, offset, vp2); +} + +// CHECK-LABEL: @test68( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test68(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(18LL, vpp); + __builtin_mma_stxvp(vp, 18LL, vp2); +} + 
+// CHECK-LABEL: @test69( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test69(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(1LL, vpp); + __builtin_mma_stxvp(vp, 1LL, vp2); +} + +// CHECK-LABEL: @test70( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test70(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(42LL, vpp); + __builtin_mma_stxvp(vp, 42LL, vp2); +} + +// CHECK-LABEL: @test71( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8* +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test71(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp); + __builtin_mma_stxvp(vp, 32768LL, vp2); +} + +// CHECK-LABEL: @test72( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 32799 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test72(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp); + __builtin_mma_stxvp(vp, 32799LL, vp2); +} + +// CHECK-LABEL: @test73( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 
64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(8LL, vpp); + __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test74( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test75( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test75(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(offs, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} diff --git a/clang/test/Sema/ppc-mma-types.c b/clang/test/Sema/ppc-mma-types.c --- a/clang/test/Sema/ppc-mma-types.c +++ b/clang/test/Sema/ppc-mma-types.c @@ -319,3 +319,17 @@ __vector_pair vp2 = (__vector_pair)vpp; // expected-error {{used type '__vector_pair' where arithmetic or pointer type is required}} } +void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) { + __vector_pair vp = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} + __builtin_mma_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}} +} + +void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) { + __vector_pair vp = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} + __builtin_mma_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} +} + +void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) { + __vector_pair vp = 
__builtin_mma_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}} + __builtin_mma_stxvp(vp, ll, s); // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1422,6 +1422,14 @@ def int_ppc_mma_xxsetaccz : Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>; + def int_ppc_mma_lxvp : + Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_ppc_mma_stxvp : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly]>; + // MMA Reduced-Precision: Outer Product Intrinsic Definitions. defm int_ppc_mma_xvi4ger8 : PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -293,6 +293,13 @@ Align(16)); } + /// SelectAddrImmX34 - Returns true if the address N can be represented by + /// a base register plus a signed 34-bit displacement. Suitable for use by + /// PSTXVP and friends. + bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG); + } + // Select an address into a single register. bool SelectAddr(SDValue N, SDValue &Base) { Base = N; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -770,6 +770,8 @@ bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, MaybeAlign EncodingAlignment) const; + bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base, + SelectionDAG &DAG) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. @@ -1325,6 +1327,8 @@ bool isIntS16Immediate(SDNode *N, int16_t &Imm); bool isIntS16Immediate(SDValue Op, int16_t &Imm); + bool isIntS34Immediate(SDNode *N, int64_t &Imm); + bool isIntS34Immediate(SDValue Op, int64_t &Imm); bool convertToNonDenormSingle(APInt &ArgAPInt); bool convertToNonDenormSingle(APFloat &ArgAPFloat); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2399,6 +2399,20 @@ return false; } +/// isIntS34Immediate - This method tests if the value of the given node can +/// be accurately represented as a sign extension from a 34-bit value. If so, +/// this returns true and the immediate. +bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); + return isInt<34>(Imm); +} +bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { + return isIntS34Immediate(Op.getNode(), Imm); +} + /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented as [r+imm]. 
If \p EncodingAlignment is @@ -2599,6 +2613,55 @@ return true; // [r+0] } +/// Similar to the 16-bit case but for instructions that take a 34-bit +/// displacement field (prefixed loads/stores). +bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp, + SDValue &Base, + SelectionDAG &DAG) const { + // Only on 64-bit targets. + if (N.getValueType() != MVT::i64) + return false; + + SDLoc dl(N); + int64_t Imm = 0; + + if (N.getOpcode() == ISD::ADD) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + return true; + } + + if (N.getOpcode() == ISD::OR) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are + // provably disjoint. + KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); + if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL) + return false; + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + return true; + } + + if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const. + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + Base = DAG.getRegister(PPC::ZERO8, N.getValueType()); + return true; + } + + return false; +} + /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1031,11 +1031,13 @@ // Define PowerPC specific addressing mode. // d-form -def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" +def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" // ds-form -def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" // dq-form -def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +// 8LS:d-form +def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp" // Below forms are all x-form addressing mode, use three different ones so we // can make a accurate check for x-form instructions in ISEL. diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1654,6 +1654,24 @@ "pstxvp $XTp, $D_RA", IIC_LdStLFD>; } +let Predicates = [PairedVectorMemops] in { + // Intrinsics for Paired Vector Loads. + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; + def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; + } + // Intrinsics for Paired Vector Stores. 
+ def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst), + (STXVP $XSp, memrix16:$dst)>; + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst), + (STXVPX $XSp, xaddrX16:$dst)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst), + (PSTXVP $XSp, memri34:$dst)>; + } +} + // TODO: We have an added complexity of 500 here. This is only a temporary // solution to have tablegen consider these patterns first. The way we do // addressing for PowerPC is complex depending on available D form, X form, or diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -60,6 +60,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -277,8 +278,11 @@ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { return SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) return IMemI->getArgOperand(0); + if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) + return IMemI->getArgOperand(1); } return nullptr; @@ -345,9 +349,13 @@ MemI = SMemI; PtrValue = SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(0); + } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(1); } else continue; } else continue; @@ -827,6 +835,11 @@ if (ST && ST->hasAltivec() && PtrValue->getType()->getPointerElementType()->isVectorTy()) return false; + // There are no update forms for the P10 lxvp/stxvp intrinsics. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II && ((II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp)) + return false; // See getPreIndexedAddressParts, the displacement for LDU/STDU has to // be 4's multiple (DS-form). For i64 loads/stores when the displacement // fits in a 16-bit signed field but isn't a multiple of 4, it will be @@ -864,7 +877,13 @@ // Check if a load/store has DQ form. auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) { assert((PtrValue && I) && "Invalid parameter!"); - return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() && + // Check if it is a P10 lxvp/stxvp intrinsic. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II) + return II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp; + // Check if it is a P9 vector load/store. 
+ return ST && ST->hasP9Vector() && (PtrValue->getType()->getPointerElementType()->isVectorTy()); }; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1223,7 +1223,8 @@ case Intrinsic::ppc_vsx_lxvd2x_be: case Intrinsic::ppc_vsx_lxvw4x_be: case Intrinsic::ppc_vsx_lxvl: - case Intrinsic::ppc_vsx_lxvll: { + case Intrinsic::ppc_vsx_lxvll: + case Intrinsic::ppc_mma_lxvp: { Info.PtrVal = Inst->getArgOperand(0); Info.ReadMem = true; Info.WriteMem = false; @@ -1239,7 +1240,8 @@ case Intrinsic::ppc_vsx_stxvd2x_be: case Intrinsic::ppc_vsx_stxvw4x_be: case Intrinsic::ppc_vsx_stxvl: - case Intrinsic::ppc_vsx_stxvll: { + case Intrinsic::ppc_vsx_stxvll: + case Intrinsic::ppc_mma_stxvp: { Info.PtrVal = Inst->getArgOperand(1); Info.ReadMem = false; Info.WriteMem = true; diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-BE + +; This test checks that LSR properly recognizes lxvp/stxvp as load/store +; intrinsics to avoid generating x-form instructions instead of d-forms. + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) +define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi r3, 0 +; CHECK-NEXT: beqlr cr0 +; CHECK-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-NEXT: clrldi r6, r3, 32 +; CHECK-NEXT: addi r3, r4, 64 +; CHECK-NEXT: addi r4, r5, 64 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp0, -64(r3) +; CHECK-NEXT: lxvp vsp2, -32(r3) +; CHECK-NEXT: lxvp vsp4, 0(r3) +; CHECK-NEXT: lxvp vsp6, 32(r3) +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: stxvp vsp0, -64(r4) +; CHECK-NEXT: stxvp vsp2, -32(r4) +; CHECK-NEXT: stxvp vsp4, 0(r4) +; CHECK-NEXT: stxvp vsp6, 32(r4) +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmplwi r3, 0 +; CHECK-BE-NEXT: beqlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-BE-NEXT: clrldi r6, r3, 32 +; CHECK-BE-NEXT: addi r3, r4, 64 +; CHECK-BE-NEXT: addi r4, r5, 64 +; CHECK-BE-NEXT: mtctr r6 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB0_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp0, -64(r3) +; CHECK-BE-NEXT: lxvp vsp2, -32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r3) +; CHECK-BE-NEXT: lxvp vsp6, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 1 +; CHECK-BE-NEXT: stxvp vsp0, -64(r4) +; CHECK-BE-NEXT: stxvp vsp2, -32(r4) +; CHECK-BE-NEXT: stxvp vsp4, 0(r4) +; CHECK-BE-NEXT: stxvp vsp6, 32(r4) +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: bdnz .LBB0_2 +; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-NEXT: blr +entry: + %cmp35.not = icmp eq i32 %n, 0 + br i1 %cmp35.not, 
label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + %0 = bitcast <256 x i1>* %ptr to i8* + %1 = bitcast <256 x i1>* %ptr2 to i8* + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %2 = getelementptr i8, i8* %0, i64 %indvars.iv + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %add2 = add nuw nsw i64 %indvars.iv, 32 + %4 = getelementptr i8, i8* %0, i64 %add2 + %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4) + %add4 = add nuw nsw i64 %indvars.iv, 64 + %6 = getelementptr i8, i8* %0, i64 %add4 + %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6) + %add6 = add nuw nsw i64 %indvars.iv, 96 + %8 = getelementptr i8, i8* %0, i64 %add6 + %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8) + %10 = getelementptr i8, i8* %1, i64 %indvars.iv + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10) + %11 = getelementptr i8, i8* %1, i64 %add2 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11) + %12 = getelementptr i8, i8* %1, i64 %add4 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12) + %13 = getelementptr i8, i8* %1, i64 %add6 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-BE + +; This test checks the PPCLoopInstrFormPrep pass supports the lxvp and stxvp +; intrinsics so we generate more dq-form instructions instead of x-forms. 
+ +%_elem_type_of_x = type <{ double }> +%_elem_type_of_y = type <{ double }> + +define void @foo(i64* %.n, [0 x %_elem_type_of_x]* %.x, [0 x %_elem_type_of_y]* %.y, <2 x double>* %.sum) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld r5, 0(r3) +; CHECK-NEXT: cmpdi r5, 1 +; CHECK-NEXT: bltlr cr0 +; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-NEXT: addi r3, r4, 1 +; CHECK-NEXT: addi r4, r5, -1 +; CHECK-NEXT: lxv vs0, 0(r6) +; CHECK-NEXT: rldicl r4, r4, 60, 4 +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_2: # %_loop_1_do_ +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp2, 0(r3) +; CHECK-NEXT: lxvp vsp4, 32(r3) +; CHECK-NEXT: addi r3, r3, 128 +; CHECK-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-NEXT: stxv vs0, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: ld r5, 0(r3) +; CHECK-BE-NEXT: cmpdi r5, 1 +; CHECK-BE-NEXT: bltlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-BE-NEXT: addi r3, r4, 1 +; CHECK-BE-NEXT: addi r4, r5, -1 +; CHECK-BE-NEXT: lxv vs0, 0(r6) +; CHECK-BE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: mtctr r4 +; CHECK-BE-NEXT: .p2align 5 +; CHECK-BE-NEXT: .LBB0_2: # %_loop_1_do_ +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp2, 0(r3) +; CHECK-BE-NEXT: lxvp vsp4, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 128 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-BE-NEXT: bdnz .LBB0_2 +; CHECK-BE-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-BE-NEXT: stxv vs0, 0(r6) +; CHECK-BE-NEXT: blr +entry: + %_val_n_2 = load i64, i64* %.n, align 8 + %_grt_tmp7 = icmp slt i64 %_val_n_2, 1 + br i1 %_grt_tmp7, label %_return_bb, label %_loop_1_do_.lr.ph + +_loop_1_do_.lr.ph: ; preds = %entry + %x_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %.sum.promoted = load <2 x double>, <2 x double>* %.sum, align 16 + br label %_loop_1_do_ + +_loop_1_do_: ; preds = %_loop_1_do_.lr.ph, %_loop_1_do_ + %_val_sum_9 = phi <2 x double> [ %.sum.promoted, %_loop_1_do_.lr.ph ], [ %_add_tmp49, %_loop_1_do_ ] + %i.08 = phi i64 [ 1, %_loop_1_do_.lr.ph ], [ %_loop_1_update_loop_ix, %_loop_1_do_ ] + %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08 + %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8* + %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1 + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %1) + %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0 + %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1 + %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %4) + %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0 + %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1 + %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> + %_add_tmp23 = fadd contract <2 x double> %_val_sum_9, %6 + %7 = bitcast <16 x i8> %.fca.1.extract2 to <2 x 
double> + %_add_tmp32 = fadd contract <2 x double> %_add_tmp23, %7 + %8 = bitcast <16 x i8> %.fca.0.extract to <2 x double> + %_add_tmp40 = fadd contract <2 x double> %_add_tmp32, %8 + %9 = bitcast <16 x i8> %.fca.1.extract to <2 x double> + %_add_tmp49 = fadd contract <2 x double> %_add_tmp40, %9 + %_loop_1_update_loop_ix = add nuw nsw i64 %i.08, 16 + %_grt_tmp = icmp sgt i64 %_loop_1_update_loop_ix, %_val_n_2 + br i1 %_grt_tmp, label %_loop_1_loopHeader_._return_bb_crit_edge, label %_loop_1_do_ + +_loop_1_loopHeader_._return_bb_crit_edge: ; preds = %_loop_1_do_ + store <2 x double> %_add_tmp49, <2 x double>* %.sum, align 16 + br label %_return_bb + +_return_bb: ; preds = %_loop_1_loopHeader_._return_bb_crit_edge, %entry + ret void +} + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -698,3 +698,315 @@ declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) + +; Function Attrs: nounwind +define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 0(r3) +; CHECK-NEXT: stxvp vsp0, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = bitcast <256 x i1>* %vp2 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) + ret void +} + +; Function Attrs: argmemonly nounwind readonly +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) + +; Function Attrs: argmemonly nounwind writeonly +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) + +; Function Attrs: nounwind +define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 %offset + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 %offset + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 18 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 18 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 18 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 18 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function 
Attrs: nounwind +define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 1 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 1 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 1 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 42 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 42 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 42 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NEXT: blr +entry: + %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 + %1 = bitcast <256 x i1>* %0 to i8* + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 + %4 = bitcast <256 x i1>* %3 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nounwind +define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; FIXME: A prefixed load (plxvp) is expected here as the offset in this +; test case is a constant that fits within 34-bits. 
+; CHECK-LABEL: test_ldst_7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: ori r5, r5, 32799 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_7: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 0 +; CHECK-BE-NEXT: ori r5, r5, 32799 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 32799 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 32799 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: lxvpx vsp4, r4, r3 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_8: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: lxvpx vsp4, r4, r3 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = getelementptr i8, i8* %2, i64 8 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0) + %6 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %5, <512 x i1>* %6, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r4) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_9: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r4) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + 
%2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r5) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r9) +; CHECK-NEXT: stxv vs1, 32(r9) +; CHECK-NEXT: stxv vs2, 16(r9) +; CHECK-NEXT: stxv vs3, 0(r9) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_10: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r5) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r9) +; CHECK-BE-NEXT: stxv vs0, 0(r9) +; CHECK-BE-NEXT: stxv vs3, 48(r9) +; CHECK-BE-NEXT: stxv vs2, 32(r9) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +}
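
Usage sketch (not part of the patch): the clang tests above already exercise the new builtins; the snippet below is a minimal, self-contained illustration of the C-level API they add. The file name, function name, and the build flag are assumptions for illustration only; the builtin signatures mirror the prototype string in BuiltinsPPC.def and the tests in builtins-ppc-mma.c.

// pair_copy.c -- illustrative sketch only.
// Assumed build line: clang -mcpu=pwr10 -O2 -S pair_copy.c
// __builtin_mma_lxvp(offset, base) loads a 32-byte vector pair from
// base + offset (offset is a signed long long byte offset);
// __builtin_mma_stxvp(pair, offset, base) stores the pair to base + offset.
// Both take a const-qualified __vector_pair pointer, per the def string
// "W256SLLiW256C*" added above.
void pair_copy(const __vector_pair *src, const __vector_pair *dst,
               long long byte_offset) {
  __vector_pair vp = __builtin_mma_lxvp(byte_offset, src);
  __builtin_mma_stxvp(vp, byte_offset, dst);
}

As the llc tests show, a constant offset that is a multiple of 16 and fits the DQ-form displacement selects the lxvp/stxvp D-forms (test_ldst_1, test_ldst_6), while variable or unaligned offsets fall back to the indexed lxvpx/stxvpx forms (test_ldst_2 through test_ldst_5, with the prefixed plxvp case noted as a FIXME in test_ldst_7).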