diff --git a/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
--- a/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
+++ b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
@@ -84,3 +84,7 @@
 let TargetPrefix = "ve" in def int_ve_vl_lvsl_svs : GCCBuiltin<"__builtin_ve_vl_lvsl_svs">, Intrinsic<[LLVMType<i64>], [LLVMType<v256f64>, LLVMType<i32>], [IntrNoMem]>;
 let TargetPrefix = "ve" in def int_ve_vl_lvsd_svs : GCCBuiltin<"__builtin_ve_vl_lvsd_svs">, Intrinsic<[LLVMType<f64>], [LLVMType<v256f64>, LLVMType<i32>], [IntrNoMem]>;
 let TargetPrefix = "ve" in def int_ve_vl_lvss_svs : GCCBuiltin<"__builtin_ve_vl_lvss_svs">, Intrinsic<[LLVMType<f32>], [LLVMType<v256f64>, LLVMType<i32>], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_lvm_mmss : GCCBuiltin<"__builtin_ve_vl_lvm_mmss">, Intrinsic<[LLVMType<v256i1>], [LLVMType<v256i1>, LLVMType<i64>, LLVMType<i64>], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_lvm_MMss : GCCBuiltin<"__builtin_ve_vl_lvm_MMss">, Intrinsic<[LLVMType<v512i1>], [LLVMType<v512i1>, LLVMType<i64>, LLVMType<i64>], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_svm_sms : GCCBuiltin<"__builtin_ve_vl_svm_sms">, Intrinsic<[LLVMType<i64>], [LLVMType<v256i1>, LLVMType<i64>], [IntrNoMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_svm_sMs : GCCBuiltin<"__builtin_ve_vl_svm_sMs">, Intrinsic<[LLVMType<i64>], [LLVMType<v512i1>, LLVMType<i64>], [IntrNoMem]>;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -717,6 +717,12 @@
   return GlobalBaseReg;
 }
 
+// VM512 pair VMP<n> -> upper VM<2n>, lower VM<2n+1>; relies on enum order.
+static Register getVM512Upper(Register reg) {
+  return VE::VM0 + 2 * (reg - VE::VMP0);
+}
+static Register getVM512Lower(Register reg) { return 1 + getVM512Upper(reg); }
+
 bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case VE::EXTEND_STACK: {
@@ -729,6 +735,84 @@
   case VE::GETSTACKTOP: {
     return expandGetStackTopPseudo(MI);
   }
+
+  case VE::LVMyir:
+  case VE::LVMyim:
+  case VE::LVMyir_y:
+  case VE::LVMyim_y: {
+    // Lower an LVM pseudo on a VM512 pair to the matching LVM on one half.
+    const unsigned Opc = MI.getOpcode();
+    const Register Pair = MI.getOperand(0).getReg();
+    // Indices 4 and up select the upper half and are rebased by 4; smaller
+    // indices select the lower half unchanged.
+    int64_t Idx = MI.getOperand(1).getImm();
+    Register Half = getVM512Lower(Pair);
+    if (Idx >= 4) {
+      Half = getVM512Upper(Pair);
+      Idx -= 4;
+    }
+    // Operand 2 is a register for the *ir forms and an immediate otherwise.
+    const MachineOperand &Val = MI.getOperand(2);
+    MachineBasicBlock &Block = *MI.getParent();
+    DebugLoc Loc = MI.getDebugLoc();
+    switch (Opc) {
+    case VE::LVMyir:
+      BuildMI(Block, MI, Loc, get(VE::LVMir))
+          .addDef(Half)
+          .addImm(Idx)
+          .addReg(Val.getReg(), getKillRegState(Val.isKill()));
+      break;
+    case VE::LVMyim:
+      BuildMI(Block, MI, Loc, get(VE::LVMim))
+          .addDef(Half)
+          .addImm(Idx)
+          .addImm(Val.getImm());
+      break;
+    case VE::LVMyir_y:
+      assert(MI.getOperand(0).getReg() == MI.getOperand(3).getReg() &&
+             "LVMyir_y has different register in 3rd operand");
+      BuildMI(Block, MI, Loc, get(VE::LVMir_m))
+          .addDef(Half)
+          .addImm(Idx)
+          .addReg(Val.getReg(), getKillRegState(Val.isKill()))
+          .addReg(Half);
+      break;
+    case VE::LVMyim_y:
+      assert(MI.getOperand(0).getReg() == MI.getOperand(3).getReg() &&
+             "LVMyim_y has different register in 3rd operand");
+      BuildMI(Block, MI, Loc, get(VE::LVMim_m))
+          .addDef(Half)
+          .addImm(Idx)
+          .addImm(Val.getImm())
+          .addReg(Half);
+      break;
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+  case VE::SVMyi: {
+    // Lower SVM on a VM512 pair to SVMmi reading one half of the pair.
+    Register Dest = MI.getOperand(0).getReg();
+    // Cache MI's operands now; MI is erased at the end of this case.
+    Register VMZ512 = MI.getOperand(1).getReg();
+    bool KillSrc = MI.getOperand(1).isKill();
+    int64_t Imm = MI.getOperand(2).getImm();
+    // Indices 4 and up select the upper half and are rebased by 4.
+    Register VMZ = getVM512Lower(VMZ512);
+    if (Imm >= 4) {
+      VMZ = getVM512Upper(VMZ512);
+      Imm -= 4;
+    }
+    MachineBasicBlock *MBB = MI.getParent();
+    DebugLoc DL = MI.getDebugLoc();
+    MachineInstr *Inst =
+        BuildMI(*MBB, MI, DL, get(VE::SVMmi), Dest).addReg(VMZ).addImm(Imm);
+    // Move the pair's kill flag to the new instruction before MI is erased.
+    if (KillSrc)
+      Inst->addRegisterKilled(VMZ512, &getRegisterInfo(), true);
+    MI.eraseFromParent();
+    return true;
+  }
   }
   return false;
 }
diff --git a/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
--- a/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
+++ b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
@@ -162,3 +162,7 @@
 def : Pat<(int_ve_vl_pfchv_ssl simm7:$I, i64:$sz, i32:$vl), (PFCHVirl (LO7 $I), i64:$sz, i32:$vl)>;
 def : Pat<(int_ve_vl_pfchvnc_ssl i64:$sy, i64:$sz, i32:$vl), (PFCHVNCrrl i64:$sy, i64:$sz, i32:$vl)>;
 def : Pat<(int_ve_vl_pfchvnc_ssl simm7:$I, i64:$sz, i32:$vl), (PFCHVNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_lvm_mmss v256i1:$ptm, uimm6:$N, i64:$sz), (LVMir_m (ULO7 $N), i64:$sz, v256i1:$ptm)>;
+def : Pat<(int_ve_vl_lvm_MMss v512i1:$ptm, uimm6:$N, i64:$sz), (LVMyir_y (ULO7 $N), i64:$sz, v512i1:$ptm)>;
+def : Pat<(int_ve_vl_svm_sms v256i1:$vmz, uimm6:$N), (SVMmi v256i1:$vmz, (ULO7 $N))>;
+def : Pat<(int_ve_vl_svm_sMs v512i1:$vmz, uimm6:$N), (SVMyi v512i1:$vmz, (ULO7 $N))>;
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -2,6 +2,26 @@
 // Vector Instructions
 //===----------------------------------------------------------------------===//
 
+// Pseudo instructions for VM512 modifications
+//
+// Specifies hasSideEffects = 0 to disable UnmodeledSideEffects.
+
+let hasSideEffects = 0 in {
+  let Constraints = "$vx = $vd", DisableEncoding = "$vd" in {
+    def LVMyir_y : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz, VM512:$vd),
+                          "# pseudo LVM $vx, $sy, $sz, $vd">;
+    def LVMyim_y : Pseudo<(outs VM512:$vx),
+                          (ins uimm3:$sy, mimm:$sz, VM512:$vd),
+                          "# pseudo LVM $vx, $sy, $sz, $vd">;
+  }
+  def LVMyir : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz),
+                      "# pseudo LVM $vx, $sy, $sz">;
+  def LVMyim : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, mimm:$sz),
+                      "# pseudo LVM $vx, $sy, $sz">;
+  def SVMyi : Pseudo<(outs I64:$sx), (ins VM512:$vz, uimm3:$sy),
+                     "# pseudo SVM $sx, $vz, $sy">;
+}
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -86,10 +86,22 @@
          ++ItAlias)
       Reserved.set(*ItAlias);
 
+  // Reserve constant registers.
+  Reserved.set(VE::VM0);
+  Reserved.set(VE::VMP0);
+
   return Reserved;
 }
 
-bool VERegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { return false; }
+// Tell whether PhysReg is one of the registers this file reserves as
+// constant registers, VM0 or VMP0.
+bool VERegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+  if (PhysReg == VE::VM0)
+    return true;
+  if (PhysReg == VE::VMP0)
+    return true;
+  return false;
+}
 
 const TargetRegisterClass *
 VERegisterInfo::getPointerRegClass(const MachineFunction &MF,
diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/lvm.ll b/llvm/test/CodeGen/VE/VELIntrinsics/lvm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/VELIntrinsics/lvm.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s
+
+;;; Test load/save vector mask intrinsic instructions
+;;;
+;;; Note:
+;;;   We test LVMir_m, LVMyir_y, SVMmi, and SVMyi instructions.
+
+; Function Attrs: nounwind readnone
+define i64 @lvm_mmss(i8* nocapture readnone %0, i64 %1) {
+; CHECK-LABEL: lvm_mmss:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lvm %vm1, 3, %s1
+; CHECK-NEXT:    svm %s0, %vm1, 3
+; CHECK-NEXT:    b.l.t (, %s10)
+  %3 = tail call <256 x i1> @llvm.ve.vl.lvm.mmss(<256 x i1> undef, i64 3, i64 %1)
+  %4 = tail call i64 @llvm.ve.vl.svm.sms(<256 x i1> %3, i64 3)
+  ret i64 %4
+}
+
+; Function Attrs: nounwind readnone
+declare <256 x i1> @llvm.ve.vl.lvm.mmss(<256 x i1>, i64, i64)
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.ve.vl.svm.sms(<256 x i1>, i64)
+
+; Function Attrs: nounwind readnone
+define i64 @lvml_MMss(i8* nocapture readnone %0, i64 %1) {
+; CHECK-LABEL: lvml_MMss:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lvm %vm2, 1, %s1
+; CHECK-NEXT:    svm %s0, %vm3, 3
+; CHECK-NEXT:    svm %s1, %vm2, 2
+; CHECK-NEXT:    adds.l %s0, %s1, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %3 = tail call <512 x i1> @llvm.ve.vl.lvm.MMss(<512 x i1> undef, i64 5, i64 %1)
+  %4 = tail call i64 @llvm.ve.vl.svm.sMs(<512 x i1> %3, i64 3)
+  %5 = tail call i64 @llvm.ve.vl.svm.sMs(<512 x i1> %3, i64 6)
+  %6 = add i64 %5, %4
+  ret i64 %6
+}
+
+; Function Attrs: nounwind readnone
+declare <512 x i1> @llvm.ve.vl.lvm.MMss(<512 x i1>, i64, i64)
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.ve.vl.svm.sMs(<512 x i1>, i64)
diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/vst.ll b/llvm/test/CodeGen/VE/VELIntrinsics/vst.ll
--- a/llvm/test/CodeGen/VE/VELIntrinsics/vst.ll
+++ b/llvm/test/CodeGen/VE/VELIntrinsics/vst.ll
@@ -32,7 +32,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -63,7 +63,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -94,7 +94,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstnc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -125,7 +125,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstnc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -156,7 +156,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -187,7 +187,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -218,7 +218,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -249,7 +249,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -280,7 +280,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -311,7 +311,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -342,7 +342,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstunc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -373,7 +373,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstunc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -404,7 +404,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstuot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -435,7 +435,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstuot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -466,7 +466,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstuncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -497,7 +497,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstuncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -528,7 +528,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -559,7 +559,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -590,7 +590,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlnc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -621,7 +621,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlnc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -652,7 +652,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -683,7 +683,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -714,7 +714,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -745,7 +745,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstlncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -776,7 +776,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst2d %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst2d %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2d.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -807,7 +807,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst2d %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst2d %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2d.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -838,7 +838,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst2d.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst2d.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dnc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -869,7 +869,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst2d.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst2d.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dnc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -900,7 +900,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst2d.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst2d.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -931,7 +931,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst2d.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst2d.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -962,7 +962,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vst2d.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vst2d.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -993,7 +993,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vst2d.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vst2d.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vst2dncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1024,7 +1024,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu2d %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu2d %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2d.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1055,7 +1055,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu2d %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu2d %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2d.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1086,7 +1086,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu2d.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu2d.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dnc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1117,7 +1117,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu2d.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu2d.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dnc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1148,7 +1148,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu2d.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu2d.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1179,7 +1179,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu2d.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu2d.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1210,7 +1210,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstu2d.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstu2d.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1241,7 +1241,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstu2d.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstu2d.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstu2dncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1272,7 +1272,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl2d %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl2d %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2d.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1303,7 +1303,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl2d %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl2d %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2d.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1334,7 +1334,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl2d.nc %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl2d.nc %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dnc.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1365,7 +1365,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl2d.nc %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl2d.nc %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dnc.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1396,7 +1396,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl2d.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl2d.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1427,7 +1427,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl2d.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl2d.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)
@@ -1458,7 +1458,7 @@
 ; CHECK-NEXT:    lea %s2, 256
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vld %v0, %s1, %s0
-; CHECK-NEXT:    vstl2d.nc.ot %v0, %s1, %s0, %vm0
+; CHECK-NEXT:    vstl2d.nc.ot %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dncot.vssml(<256 x double> %3, i64 %1, i8* %0, <256 x i1> undef, i32 256)
@@ -1489,7 +1489,7 @@
 ; CHECK-NEXT:    lea %s1, 256
 ; CHECK-NEXT:    lvl %s1
 ; CHECK-NEXT:    vld %v0, 8, %s0
-; CHECK-NEXT:    vstl2d.nc.ot %v0, 8, %s0, %vm0
+; CHECK-NEXT:    vstl2d.nc.ot %v0, 8, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
   tail call void @llvm.ve.vl.vstl2dncot.vssml(<256 x double> %2, i64 8, i8* %0, <256 x i1> undef, i32 256)