diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -843,15 +843,25 @@
   return Offset;
 }
 
+static bool hasRVVSpillWithFIs(MachineFunction &MF, const RISCVInstrInfo &TII) {
+  return any_of(MF, [&TII](const MachineBasicBlock &MBB) {
+    return any_of(MBB, [&TII](const MachineInstr &MI) {
+      return TII.isRVVSpill(MI, /*CheckFIs=*/true);
+    });
+  });
+}
+
 void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
-  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const RISCVRegisterInfo *RegInfo =
+      MF.getSubtarget<RISCVSubtarget>().getRegisterInfo();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterClass *RC = &RISCV::GPRRegClass;
   auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
 
   int64_t RVVStackSize = assignRVVStackObjectOffsets(MFI);
   RVFI->setRVVStackSize(RVVStackSize);
+  const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
 
   // estimateStackSize has been observed to under-estimate the final stack
   // size, so give ourselves wiggle-room by checking for stack size
@@ -859,7 +869,10 @@
   // FIXME: It may be possible to craft a function with a small stack that
   // still needs an emergency spill slot for branch relaxation. This case
   // would currently be missed.
-  if (!isInt<11>(MFI.estimateStackSize(MF)) || RVVStackSize != 0) {
+  // RVV loads & stores have no capacity to hold immediate address offsets,
+  // so we must always reserve an emergency spill slot if the MachineFunction
+  // contains any RVV spills.
+  if (!isInt<11>(MFI.estimateStackSize(MF)) || hasRVVSpillWithFIs(MF, TII)) {
     int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
                                           RegInfo->getSpillAlign(*RC), false);
     RS->addScavengingFrameIndex(RegScavFI);
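
The reservation matters because a scalar stack access can fold its
frame-index offset into the instruction's immediate, whereas RVV loads and
stores accept only a register base, so frame-index elimination may need a
scratch GPR to materialize the address. A minimal sketch of the two forms
(register choices illustrative, mirroring the test updates below):

    ld        a0, 16(sp)     # scalar: offset folds into the immediate
    addi      a1, sp, 16     # RVV: no immediate form, so materialize the
    vle16.v   v25, (a1)      #      address into a scratch register first
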
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -147,6 +147,11 @@
                                  MachineBasicBlock::iterator II,
                                  const DebugLoc &DL, int64_t Amount) const;
 
+  // Returns true if the given MI has an RVV opcode for which we may expect to
+  // see a FrameIndex operand. When CheckFIs is true, the instruction must
+  // additionally contain at least one FrameIndex operand.
+  bool isRVVSpill(const MachineInstr &MI, bool CheckFIs) const;
+
   Optional<std::pair<unsigned, unsigned>>
   isRVVSpillForZvlsseg(unsigned Opcode) const;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1412,6 +1412,46 @@
   return VL;
 }
 
+static bool isRVVWholeLoadStore(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case RISCV::VS1R_V:
+  case RISCV::VS2R_V:
+  case RISCV::VS4R_V:
+  case RISCV::VS8R_V:
+  case RISCV::VL1RE8_V:
+  case RISCV::VL2RE8_V:
+  case RISCV::VL4RE8_V:
+  case RISCV::VL8RE8_V:
+  case RISCV::VL1RE16_V:
+  case RISCV::VL2RE16_V:
+  case RISCV::VL4RE16_V:
+  case RISCV::VL8RE16_V:
+  case RISCV::VL1RE32_V:
+  case RISCV::VL2RE32_V:
+  case RISCV::VL4RE32_V:
+  case RISCV::VL8RE32_V:
+  case RISCV::VL1RE64_V:
+  case RISCV::VL2RE64_V:
+  case RISCV::VL4RE64_V:
+  case RISCV::VL8RE64_V:
+    return true;
+  }
+}
+
+bool RISCVInstrInfo::isRVVSpill(const MachineInstr &MI, bool CheckFIs) const {
+  // RVV lacks any support for immediate addressing of stack slots, so be
+  // conservative.
+  unsigned Opcode = MI.getOpcode();
+  if (!RISCVVPseudosTable::getPseudoInfo(Opcode) &&
+      !isRVVWholeLoadStore(Opcode) && !isRVVSpillForZvlsseg(Opcode))
+    return false;
+  return !CheckFIs || any_of(MI.operands(), [](const MachineOperand &MO) {
+    return MO.isFI();
+  });
+}
+
 Optional<std::pair<unsigned, unsigned>>
 RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const {
   switch (Opcode) {
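
The predicate above unions three opcode families: RVV pseudos known to
RISCVVPseudosTable, the whole-register moves enumerated in
isRVVWholeLoadStore, and the Zvlsseg segment spills handled by
isRVVSpillForZvlsseg. Representative whole-register mnemonics (operands
illustrative):

    vs1r.v     v8, (a0)      # whole-register store, VS1R_V
    vl2re32.v  v8, (a0)      # whole-register load, VL2RE32_V
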
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -156,34 +156,6 @@
   return true;
 }
 
-static bool isRVVWholeLoadStore(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case RISCV::VS1R_V:
-  case RISCV::VS2R_V:
-  case RISCV::VS4R_V:
-  case RISCV::VS8R_V:
-  case RISCV::VL1RE8_V:
-  case RISCV::VL2RE8_V:
-  case RISCV::VL4RE8_V:
-  case RISCV::VL8RE8_V:
-  case RISCV::VL1RE16_V:
-  case RISCV::VL2RE16_V:
-  case RISCV::VL4RE16_V:
-  case RISCV::VL8RE16_V:
-  case RISCV::VL1RE32_V:
-  case RISCV::VL2RE32_V:
-  case RISCV::VL4RE32_V:
-  case RISCV::VL8RE32_V:
-  case RISCV::VL1RE64_V:
-  case RISCV::VL2RE64_V:
-  case RISCV::VL4RE64_V:
-  case RISCV::VL8RE64_V:
-    return true;
-  }
-}
-
 void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                             int SPAdj, unsigned FIOperandNum,
                                             RegScavenger *RS) const {
@@ -199,10 +171,8 @@
   Register FrameReg;
   StackOffset Offset =
       getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg);
-  bool isRVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()) ||
-               isRVVWholeLoadStore(MI.getOpcode()) ||
-               TII->isRVVSpillForZvlsseg(MI.getOpcode());
-  if (!isRVV)
+  bool IsRVVSpill = TII->isRVVSpill(MI, /*CheckFIs=*/false);
+  if (!IsRVVSpill)
     Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
 
   if (!isInt<32>(Offset.getFixed())) {
@@ -255,7 +225,7 @@
     // Offset = (fixed offset, 0)
     MI.getOperand(FIOperandNum)
         .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
-    if (!isRVV)
+    if (!IsRVVSpill)
       MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
     else {
       if (Offset.getFixed()) {
@@ -286,7 +256,7 @@
         .addReg(FrameReg, getKillRegState(FrameRegIsKill))
         .addReg(ScalableFactorRegister, RegState::Kill);
 
-    if (isRVV && Offset.getFixed()) {
+    if (IsRVVSpill && Offset.getFixed()) {
       // Scalable load/store has no immediate argument. We need to add the
       // fixed part into the load/store base address.
       BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), VL)
@@ -296,7 +266,7 @@
 
     // 3. Replace address register with calculated address register
     MI.getOperand(FIOperandNum).ChangeToRegister(VL, false, false, true);
-    if (!isRVV)
+    if (!IsRVVSpill)
       MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
   }
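
When an RVV spill's offset has both scalable and fixed components, the
rewritten code above folds the fixed part into the computed base register
rather than an immediate operand, since scalable loads and stores take no
immediate. A hedged sketch of the resulting sequence (registers and the
LMUL=2 scaling illustrative):

    csrr    a1, vlenb        # scalable component: vlenb * factor
    slli    a1, a1, 1
    add     a1, sp, a1       # base = frame register + scalable offset
    addi    a1, a1, 16       # fixed component folded into the base
    vs2r.v  v8, (a1)         # RVV store accepts only a register base
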
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
--- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
@@ -5,12 +5,10 @@
 define void @lmul1() nounwind {
 ; CHECK-LABEL: lmul1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = alloca <vscale x 1 x i64>
   ret void
@@ -19,14 +17,12 @@
 define void @lmul2() nounwind {
 ; CHECK-LABEL: lmul2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v = alloca <vscale x 2 x i64>
   ret void
@@ -75,7 +71,6 @@
 define void @lmul1_and_2() nounwind {
 ; CHECK-LABEL: lmul1_and_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
@@ -84,7 +79,6 @@
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v1 = alloca <vscale x 1 x i64>
   %v2 = alloca <vscale x 2 x i64>
@@ -138,7 +132,6 @@
 define void @lmul2_and_1() nounwind {
 ; CHECK-LABEL: lmul2_and_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
@@ -147,7 +140,6 @@
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v1 = alloca <vscale x 2 x i64>
   %v2 = alloca <vscale x 1 x i64>
@@ -250,18 +242,18 @@
 define void @gpr_and_lmul1_and_2() nounwind {
 ; CHECK-LABEL: gpr_and_lmul1_and_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, zero, 3
-; CHECK-NEXT:    sd a0, 24(sp)
+; CHECK-NEXT:    sd a0, 8(sp)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 32
+; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %x1 = alloca i64
   %v1 = alloca <vscale x 1 x i64>
@@ -273,21 +265,21 @@
 define void @gpr_and_lmul1_and_4() nounwind {
 ; CHECK-LABEL: gpr_and_lmul1_and_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -64
-; CHECK-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    addi s0, sp, 64
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi s0, sp, 32
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -32
 ; CHECK-NEXT:    addi a0, zero, 3
-; CHECK-NEXT:    sd a0, 40(sp)
-; CHECK-NEXT:    addi sp, s0, -64
-; CHECK-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 64
+; CHECK-NEXT:    sd a0, 8(sp)
+; CHECK-NEXT:    addi sp, s0, -32
+; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %x1 = alloca i64
   %v1 = alloca <vscale x 1 x i64>
@@ -379,14 +371,12 @@
 define void @masks() nounwind {
 ; CHECK-LABEL: masks:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %v1 = alloca <vscale x 1 x i1>
   %v2 = alloca <vscale x 2 x i1>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -7,8 +7,8 @@
 define void @bswap_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v8i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -16,7 +16,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -24,59 +24,60 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v8i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
@@ -84,7 +85,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
@@ -92,59 +93,60 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v8i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -152,7 +154,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -160,59 +162,60 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a2, a1
-; LMULMAX1-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v8i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
@@ -220,7 +223,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
@@ -228,53 +231,54 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    or a1, a2, a1
-; LMULMAX1-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
   %b = load <8 x i16>, <8 x i16>* %y
@@ -287,8 +291,8 @@
 define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v4i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -304,7 +308,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -317,7 +321,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    srli a2, a1, 8
@@ -329,7 +333,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    srli a2, a1, 8
@@ -341,17 +345,18 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v4i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
@@ -367,7 +372,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
@@ -380,7 +385,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    srliw a2, a1, 8
@@ -392,7 +397,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    srliw a2, a1, 8
@@ -404,17 +409,18 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v4i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -430,7 +436,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -443,7 +449,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 8
@@ -455,7 +461,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 8
@@ -467,17 +473,18 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v4i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
@@ -493,7 +500,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a2
-; LMULMAX1-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
@@ -506,7 +513,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a2
-; LMULMAX1-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 8
@@ -518,7 +525,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a2
-; LMULMAX1-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 8
@@ -530,11 +537,12 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a2
-; LMULMAX1-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <4 x i32>, <4 x i32>* %x
   %b = load <4 x i32>, <4 x i32>* %y
@@ -547,8 +555,8 @@
 define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -564,7 +572,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -577,7 +585,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    addi a1, zero, 32
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v25, v25, a1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v25
@@ -590,7 +598,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a2, a2, 24
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a5
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a4
-; LMULMAX2-RV32-NEXT:    sw a2, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a2, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v25, v26, a1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    srli a2, a1, 8
@@ -602,12 +610,13 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a2
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v2i64:
@@ -676,8 +685,8 @@
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -693,7 +702,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -706,7 +715,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, zero, 32
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v25, v25, a1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
@@ -719,7 +728,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a4
-; LMULMAX1-RV32-NEXT:    sw a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a2, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v25, v26, a1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 8
@@ -731,12 +740,13 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v2i64:
@@ -813,13 +823,13 @@
 define void @bswap_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v16i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -829,7 +839,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -837,123 +847,124 @@
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV32-NEXT:    slli a1, a1, 16
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v16i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -963,7 +974,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
@@ -971,118 +982,119 @@
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    slli a2, a1, 8
 ; LMULMAX2-RV64-NEXT:    slli a1, a1, 48
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v16i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a1)
@@ -1092,7 +1104,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
@@ -1100,117 +1112,118 @@
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV32-NEXT:    slli a2, a2, 16
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    or a2, a3, a2
-; LMULMAX1-RV32-NEXT:    sh a2, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a2, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a2)
+; LMULMAX1-RV32-NEXT:    addi a2, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a2)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse16.v v26, (a1)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v16i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a1)
@@ -1220,7 +1233,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
@@ -1228,111 +1241,112 @@
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    slli a3, a2, 8
 ; LMULMAX1-RV64-NEXT:    slli a2, a2, 48
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    or a2, a3, a2
-; LMULMAX1-RV64-NEXT:    sh a2, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a2, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a2)
+; LMULMAX1-RV64-NEXT:    addi a2, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a2)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse16.v v26, (a1)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %x
   %b = load <16 x i16>, <16 x i16>* %y
@@ -1345,13 +1359,13 @@
 define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v8i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -1369,7 +1383,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
@@ -1382,7 +1396,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1394,7 +1408,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1406,7 +1420,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1418,7 +1432,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1430,7 +1444,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1442,7 +1456,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v26
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1454,25 +1468,26 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a2, a3, a2
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v8i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -1490,7 +1505,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
@@ -1503,7 +1518,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 28(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1515,7 +1530,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 24(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1527,7 +1542,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 20(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1539,7 +1554,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 16(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1551,7 +1566,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1563,7 +1578,7 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    or a3, a3, a4
-; LMULMAX2-RV64-NEXT:    sw a3, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a3, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v26
 ; LMULMAX2-RV64-NEXT:    srliw a4, a3, 8
@@ -1575,20 +1590,21 @@
 ; LMULMAX2-RV64-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV64-NEXT:    or a2, a3, a2
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v8i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a6)
@@ -1606,7 +1622,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a4, a4, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a4, a1
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -1619,7 +1635,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -1631,7 +1647,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -1643,7 +1659,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
 ; LMULMAX1-RV32-NEXT:    and a4, a4, a2
@@ -1654,7 +1670,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -1666,7 +1682,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -1678,7 +1694,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -1690,20 +1706,21 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v8i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a6)
@@ -1721,7 +1738,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a4, a4, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a4, a1
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
-; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
@@ -1734,7 +1751,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
@@ -1746,7 +1763,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
@@ -1758,7 +1775,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
 ; LMULMAX1-RV64-NEXT:    and a4, a4, a2
@@ -1769,7 +1786,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
@@ -1781,7 +1798,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
@@ -1793,7 +1810,7 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a4
-; LMULMAX1-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a4, a1, 8
@@ -1805,14 +1822,15 @@
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a2
-; LMULMAX1-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %x
   %b = load <8 x i32>, <8 x i32>* %y
@@ -1825,13 +1843,13 @@
 define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: bswap_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
@@ -1849,7 +1867,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v28
@@ -1862,7 +1880,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v30, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v30
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1874,7 +1892,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v8, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a3, v8
 ; LMULMAX2-RV32-NEXT:    srli a4, a3, 8
@@ -1886,7 +1904,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a3, a3, 24
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a5
 ; LMULMAX2-RV32-NEXT:    or a3, a3, a4
-; LMULMAX2-RV32-NEXT:    sw a3, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a3, 44(sp)
 ; LMULMAX2-RV32-NEXT:    addi a3, zero, 32
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v26, a3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a4, v26
@@ -1899,7 +1917,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a4, a4, 24
 ; LMULMAX2-RV32-NEXT:    or a4, a4, a5
 ; LMULMAX2-RV32-NEXT:    or a2, a4, a2
-; LMULMAX2-RV32-NEXT:    sw a2, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a2, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v28, a3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX2-RV32-NEXT:    srli a4, a2, 8
@@ -1911,7 +1929,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a2, a2, 24
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a5
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a4
-; LMULMAX2-RV32-NEXT:    sw a2, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a2, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v30, a3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX2-RV32-NEXT:    srli a4, a2, 8
@@ -1923,7 +1941,7 @@
 ; LMULMAX2-RV32-NEXT:    slli a2, a2, 24
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a5
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a4
-; LMULMAX2-RV32-NEXT:    sw a2, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a2, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v8, a3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX2-RV32-NEXT:    srli a3, a2, 8
@@ -1935,26 +1953,27 @@
 ; LMULMAX2-RV32-NEXT:    slli a2, a2, 24
 ; LMULMAX2-RV32-NEXT:    or a2, a2, a3
 ; LMULMAX2-RV32-NEXT:    or a1, a2, a1
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: bswap_v4i64:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
@@ -1989,7 +2008,7 @@
 ; LMULMAX2-RV64-NEXT:    or a2, a2, a4
 ; LMULMAX2-RV64-NEXT:    or a2, a2, a3
 ; LMULMAX2-RV64-NEXT:    or a1, a2, a1
-; LMULMAX2-RV64-NEXT:    sd a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
@@ -2014,7 +2033,7 @@
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sd a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srli a2, a1, 40
@@ -2038,7 +2057,7 @@
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sd a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    srli a2, a1, 40
@@ -2062,20 +2081,21 @@
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a2
-; LMULMAX2-RV64-NEXT:    sd a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle64.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle64.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: bswap_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v26, (a6)
@@ -2093,7 +2113,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a4, a4, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a4, a1
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
-; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -2106,7 +2126,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    addi a7, zero, 32
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v26, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a4, v26
@@ -2119,7 +2139,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a4, a4, 24
 ; LMULMAX1-RV32-NEXT:    or a4, a4, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a4, a1
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v27, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -2131,7 +2151,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
 ; LMULMAX1-RV32-NEXT:    and a4, a4, a2
@@ -2142,7 +2162,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -2154,7 +2174,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v25, v25, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -2166,7 +2186,7 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a4
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v25, v26, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 8
@@ -2178,15 +2198,16 @@
 ; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse64.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -782,67 +782,69 @@
 define <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
 ; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
 ; LMULMAX8:       # %bb.0:
-; LMULMAX8-NEXT:    addi sp, sp, -256
-; LMULMAX8-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX8-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; LMULMAX8-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; LMULMAX8-NEXT:    addi sp, sp, -384
+; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
+; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; LMULMAX8-NEXT:    .cfi_offset ra, -8
 ; LMULMAX8-NEXT:    .cfi_offset s0, -16
-; LMULMAX8-NEXT:    addi s0, sp, 256
+; LMULMAX8-NEXT:    addi s0, sp, 384
 ; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX8-NEXT:    andi sp, sp, -128
 ; LMULMAX8-NEXT:    addi a2, zero, 32
 ; LMULMAX8-NEXT:    vsetvli zero, a2, e32,m8,ta,mu
 ; LMULMAX8-NEXT:    vle32.v v24, (a0)
-; LMULMAX8-NEXT:    mv a0, sp
+; LMULMAX8-NEXT:    addi a0, sp, 128
 ; LMULMAX8-NEXT:    addi a2, zero, 42
-; LMULMAX8-NEXT:    vse32.v v8, (sp)
+; LMULMAX8-NEXT:    addi a3, sp, 128
+; LMULMAX8-NEXT:    vse32.v v8, (a3)
 ; LMULMAX8-NEXT:    vmv8r.v v8, v24
 ; LMULMAX8-NEXT:    call ext3@plt
-; LMULMAX8-NEXT:    addi sp, s0, -256
-; LMULMAX8-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; LMULMAX8-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; LMULMAX8-NEXT:    addi sp, sp, 256
+; LMULMAX8-NEXT:    addi sp, s0, -384
+; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; LMULMAX8-NEXT:    addi sp, sp, 384
 ; LMULMAX8-NEXT:    ret
 ;
 ; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
 ; LMULMAX4:       # %bb.0:
-; LMULMAX4-NEXT:    addi sp, sp, -256
-; LMULMAX4-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX4-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; LMULMAX4-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; LMULMAX4-NEXT:    addi sp, sp, -384
+; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
+; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; LMULMAX4-NEXT:    .cfi_offset ra, -8
 ; LMULMAX4-NEXT:    .cfi_offset s0, -16
-; LMULMAX4-NEXT:    addi s0, sp, 256
+; LMULMAX4-NEXT:    addi s0, sp, 384
 ; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX4-NEXT:    andi sp, sp, -128
 ; LMULMAX4-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
 ; LMULMAX4-NEXT:    vle32.v v28, (a0)
 ; LMULMAX4-NEXT:    addi a0, a0, 64
 ; LMULMAX4-NEXT:    vle32.v v24, (a0)
-; LMULMAX4-NEXT:    addi a0, sp, 64
+; LMULMAX4-NEXT:    addi a0, sp, 192
 ; LMULMAX4-NEXT:    vse32.v v12, (a0)
-; LMULMAX4-NEXT:    mv a0, sp
+; LMULMAX4-NEXT:    addi a0, sp, 128
 ; LMULMAX4-NEXT:    addi a3, zero, 42
-; LMULMAX4-NEXT:    vse32.v v8, (sp)
+; LMULMAX4-NEXT:    addi a1, sp, 128
+; LMULMAX4-NEXT:    vse32.v v8, (a1)
 ; LMULMAX4-NEXT:    vmv4r.v v8, v28
 ; LMULMAX4-NEXT:    vmv4r.v v12, v24
 ; LMULMAX4-NEXT:    call ext3@plt
-; LMULMAX4-NEXT:    addi sp, s0, -256
-; LMULMAX4-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; LMULMAX4-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; LMULMAX4-NEXT:    addi sp, sp, 256
+; LMULMAX4-NEXT:    addi sp, s0, -384
+; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; LMULMAX4-NEXT:    addi sp, sp, 384
 ; LMULMAX4-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    addi sp, sp, -256
-; LMULMAX2-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX2-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; LMULMAX2-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; LMULMAX2-NEXT:    addi sp, sp, -384
+; LMULMAX2-NEXT:    .cfi_def_cfa_offset 384
+; LMULMAX2-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; LMULMAX2-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; LMULMAX2-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-NEXT:    addi s0, sp, 256
+; LMULMAX2-NEXT:    addi s0, sp, 384
 ; LMULMAX2-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-NEXT:    andi sp, sp, -128
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -853,24 +855,25 @@
 ; LMULMAX2-NEXT:    vle32.v v30, (a1)
 ; LMULMAX2-NEXT:    addi a0, a0, 96
 ; LMULMAX2-NEXT:    vle32.v v24, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 96
+; LMULMAX2-NEXT:    addi a0, sp, 224
 ; LMULMAX2-NEXT:    vse32.v v14, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 64
+; LMULMAX2-NEXT:    addi a0, sp, 192
 ; LMULMAX2-NEXT:    vse32.v v12, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 32
+; LMULMAX2-NEXT:    addi a0, sp, 160
 ; LMULMAX2-NEXT:    vse32.v v10, (a0)
-; LMULMAX2-NEXT:    mv a0, sp
+; LMULMAX2-NEXT:    addi a0, sp, 128
 ; LMULMAX2-NEXT:    addi a5, zero, 42
-; LMULMAX2-NEXT:    vse32.v v8, (sp)
+; LMULMAX2-NEXT:    addi a1, sp, 128
+; LMULMAX2-NEXT:    vse32.v v8, (a1)
 ; LMULMAX2-NEXT:    vmv2r.v v8, v26
 ; LMULMAX2-NEXT:    vmv2r.v v10, v28
 ; LMULMAX2-NEXT:    vmv2r.v v12, v30
 ; LMULMAX2-NEXT:    vmv2r.v v14, v24
 ; LMULMAX2-NEXT:    call ext3@plt
-; LMULMAX2-NEXT:    addi sp, s0, -256
-; LMULMAX2-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; LMULMAX2-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; LMULMAX2-NEXT:    addi sp, sp, 256
+; LMULMAX2-NEXT:    addi sp, s0, -384
+; LMULMAX2-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; LMULMAX2-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; LMULMAX2-NEXT:    addi sp, sp, 384
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
@@ -1013,13 +1016,13 @@
 define <32 x i32> @call_split_vector_args(<2 x i32>* %pa, <32 x i32>* %pb) {
 ; LMULMAX8-LABEL: call_split_vector_args:
 ; LMULMAX8:       # %bb.0:
-; LMULMAX8-NEXT:    addi sp, sp, -256
-; LMULMAX8-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX8-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; LMULMAX8-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; LMULMAX8-NEXT:    addi sp, sp, -384
+; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
+; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; LMULMAX8-NEXT:    .cfi_offset ra, -8
 ; LMULMAX8-NEXT:    .cfi_offset s0, -16
-; LMULMAX8-NEXT:    addi s0, sp, 256
+; LMULMAX8-NEXT:    addi s0, sp, 384
 ; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX8-NEXT:    andi sp, sp, -128
 ; LMULMAX8-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
@@ -1027,28 +1030,29 @@
 ; LMULMAX8-NEXT:    addi a0, zero, 32
 ; LMULMAX8-NEXT:    vsetvli zero, a0, e32,m8,ta,mu
 ; LMULMAX8-NEXT:    vle32.v v16, (a1)
-; LMULMAX8-NEXT:    mv a0, sp
-; LMULMAX8-NEXT:    vse32.v v16, (sp)
+; LMULMAX8-NEXT:    addi a0, sp, 128
+; LMULMAX8-NEXT:    addi a1, sp, 128
+; LMULMAX8-NEXT:    vse32.v v16, (a1)
 ; LMULMAX8-NEXT:    vmv1r.v v9, v8
 ; LMULMAX8-NEXT:    vmv1r.v v10, v8
 ; LMULMAX8-NEXT:    vmv1r.v v11, v8
 ; LMULMAX8-NEXT:    vmv1r.v v12, v8
 ; LMULMAX8-NEXT:    call split_vector_args@plt
-; LMULMAX8-NEXT:    addi sp, s0, -256
-; LMULMAX8-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; LMULMAX8-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; LMULMAX8-NEXT:    addi sp, sp, 256
+; LMULMAX8-NEXT:    addi sp, s0, -384
+; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; LMULMAX8-NEXT:    addi sp, sp, 384
 ; LMULMAX8-NEXT:    ret
 ;
 ; LMULMAX4-LABEL: call_split_vector_args:
 ; LMULMAX4:       # %bb.0:
-; LMULMAX4-NEXT:    addi sp, sp, -256
-; LMULMAX4-NEXT:    .cfi_def_cfa_offset 256
-; LMULMAX4-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; LMULMAX4-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; LMULMAX4-NEXT:    addi sp, sp, -384
+; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
+; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; LMULMAX4-NEXT:    .cfi_offset ra, -8
 ; LMULMAX4-NEXT:    .cfi_offset s0, -16
-; LMULMAX4-NEXT:    addi s0, sp, 256
+; LMULMAX4-NEXT:    addi s0, sp, 384
 ; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX4-NEXT:    andi sp, sp, -128
 ; LMULMAX4-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
@@ -1057,30 +1061,31 @@
 ; LMULMAX4-NEXT:    vle32.v v16, (a1)
 ; LMULMAX4-NEXT:    addi a0, a1, 64
 ; LMULMAX4-NEXT:    vle32.v v20, (a0)
-; LMULMAX4-NEXT:    addi a0, sp, 64
+; LMULMAX4-NEXT:    addi a0, sp, 192
 ; LMULMAX4-NEXT:    vse32.v v20, (a0)
-; LMULMAX4-NEXT:    mv a0, sp
-; LMULMAX4-NEXT:    vse32.v v16, (sp)
+; LMULMAX4-NEXT:    addi a0, sp, 128
+; LMULMAX4-NEXT:    addi a1, sp, 128
+; LMULMAX4-NEXT:    vse32.v v16, (a1)
 ; LMULMAX4-NEXT:    vmv1r.v v9, v8
 ; LMULMAX4-NEXT:    vmv1r.v v10, v8
 ; LMULMAX4-NEXT:    vmv1r.v v11, v8
 ; LMULMAX4-NEXT:    vmv1r.v v12, v8
 ; LMULMAX4-NEXT:    call split_vector_args@plt
-; LMULMAX4-NEXT:    addi sp, s0, -256
-; LMULMAX4-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; LMULMAX4-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; LMULMAX4-NEXT:    addi sp, sp, 256
+; LMULMAX4-NEXT:    addi sp, s0, -384
+; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; LMULMAX4-NEXT:    addi sp, sp, 384
 ; LMULMAX4-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: call_split_vector_args:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    addi sp, sp, -128
-; LMULMAX2-NEXT:    .cfi_def_cfa_offset 128
-; LMULMAX2-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; LMULMAX2-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; LMULMAX2-NEXT:    addi sp, sp, -256
+; LMULMAX2-NEXT:    .cfi_def_cfa_offset 256
+; LMULMAX2-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
+; LMULMAX2-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
 ; LMULMAX2-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-NEXT:    addi s0, sp, 128
+; LMULMAX2-NEXT:    addi s0, sp, 256
 ; LMULMAX2-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-NEXT:    andi sp, sp, -128
 ; LMULMAX2-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
@@ -1093,33 +1098,34 @@
 ; LMULMAX2-NEXT:    vle32.v v18, (a0)
 ; LMULMAX2-NEXT:    addi a0, a1, 96
 ; LMULMAX2-NEXT:    vle32.v v20, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 64
+; LMULMAX2-NEXT:    addi a0, sp, 192
 ; LMULMAX2-NEXT:    vse32.v v20, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 32
+; LMULMAX2-NEXT:    addi a0, sp, 160
 ; LMULMAX2-NEXT:    vse32.v v18, (a0)
-; LMULMAX2-NEXT:    mv a0, sp
-; LMULMAX2-NEXT:    vse32.v v16, (sp)
+; LMULMAX2-NEXT:    addi a0, sp, 128
+; LMULMAX2-NEXT:    addi a1, sp, 128
+; LMULMAX2-NEXT:    vse32.v v16, (a1)
 ; LMULMAX2-NEXT:    vmv1r.v v9, v8
 ; LMULMAX2-NEXT:    vmv1r.v v10, v8
 ; LMULMAX2-NEXT:    vmv1r.v v11, v8
 ; LMULMAX2-NEXT:    vmv1r.v v12, v8
 ; LMULMAX2-NEXT:    vmv2r.v v22, v14
 ; LMULMAX2-NEXT:    call split_vector_args@plt
-; LMULMAX2-NEXT:    addi sp, s0, -128
-; LMULMAX2-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; LMULMAX2-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; LMULMAX2-NEXT:    addi sp, sp, 128
+; LMULMAX2-NEXT:    addi sp, s0, -256
+; LMULMAX2-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
+; LMULMAX2-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
+; LMULMAX2-NEXT:    addi sp, sp, 256
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: call_split_vector_args:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -128
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 128
-; LMULMAX1-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; LMULMAX1-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; LMULMAX1-NEXT:    addi sp, sp, -256
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 256
+; LMULMAX1-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
+; LMULMAX1-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
 ; LMULMAX1-NEXT:    .cfi_offset ra, -8
 ; LMULMAX1-NEXT:    .cfi_offset s0, -16
-; LMULMAX1-NEXT:    addi s0, sp, 128
+; LMULMAX1-NEXT:    addi s0, sp, 256
 ; LMULMAX1-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX1-NEXT:    andi sp, sp, -128
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
@@ -1140,16 +1146,17 @@
 ; LMULMAX1-NEXT:    vle32.v v19, (a0)
 ; LMULMAX1-NEXT:    addi a0, a1, 112
 ; LMULMAX1-NEXT:    vle32.v v20, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 64
+; LMULMAX1-NEXT:    addi a0, sp, 192
 ; LMULMAX1-NEXT:    vse32.v v20, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 48
+; LMULMAX1-NEXT:    addi a0, sp, 176
 ; LMULMAX1-NEXT:    vse32.v v19, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 32
+; LMULMAX1-NEXT:    addi a0, sp, 160
 ; LMULMAX1-NEXT:    vse32.v v18, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    addi a0, sp, 144
 ; LMULMAX1-NEXT:    vse32.v v17, (a0)
-; LMULMAX1-NEXT:    mv a0, sp
-; LMULMAX1-NEXT:    vse32.v v16, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 128
+; LMULMAX1-NEXT:    addi a1, sp, 128
+; LMULMAX1-NEXT:    vse32.v v16, (a1)
 ; LMULMAX1-NEXT:    vmv1r.v v9, v8
 ; LMULMAX1-NEXT:    vmv1r.v v10, v8
 ; LMULMAX1-NEXT:    vmv1r.v v11, v8
@@ -1158,10 +1165,10 @@
 ; LMULMAX1-NEXT:    vmv1r.v v22, v14
 ; LMULMAX1-NEXT:    vmv1r.v v23, v15
 ; LMULMAX1-NEXT:    call split_vector_args@plt
-; LMULMAX1-NEXT:    addi sp, s0, -128
-; LMULMAX1-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; LMULMAX1-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; LMULMAX1-NEXT:    addi sp, sp, 128
+; LMULMAX1-NEXT:    addi sp, s0, -256
+; LMULMAX1-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
+; LMULMAX1-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
+; LMULMAX1-NEXT:    addi sp, sp, 256
 ; LMULMAX1-NEXT:    ret
   %a = load <2 x i32>, <2 x i32>* %pa
   %b = load <32 x i32>, <32 x i32>* %pb
@@ -1174,55 +1181,70 @@
 define <32 x i32> @vector_arg_via_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) {
 ; LMULMAX8-LABEL: vector_arg_via_stack:
 ; LMULMAX8:       # %bb.0:
+; LMULMAX8-NEXT:    addi sp, sp, -16
+; LMULMAX8-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX8-NEXT:    addi a0, zero, 32
 ; LMULMAX8-NEXT:    vsetvli zero, a0, e32,m8,ta,mu
-; LMULMAX8-NEXT:    vle32.v v16, (sp)
+; LMULMAX8-NEXT:    addi a0, sp, 16
+; LMULMAX8-NEXT:    vle32.v v16, (a0)
 ; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
+; LMULMAX8-NEXT:    addi sp, sp, 16
 ; LMULMAX8-NEXT:    ret
 ;
 ; LMULMAX4-LABEL: vector_arg_via_stack:
 ; LMULMAX4:       # %bb.0:
+; LMULMAX4-NEXT:    addi sp, sp, -16
+; LMULMAX4-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX4-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
-; LMULMAX4-NEXT:    vle32.v v28, (sp)
-; LMULMAX4-NEXT:    addi a0, sp, 64
+; LMULMAX4-NEXT:    addi a0, sp, 16
+; LMULMAX4-NEXT:    vle32.v v28, (a0)
+; LMULMAX4-NEXT:    addi a0, sp, 80
 ; LMULMAX4-NEXT:    vle32.v v16, (a0)
 ; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
 ; LMULMAX4-NEXT:    vadd.vv v12, v12, v16
+; LMULMAX4-NEXT:    addi sp, sp, 16
 ; LMULMAX4-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: vector_arg_via_stack:
 ; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    addi sp, sp, -16
+; LMULMAX2-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-NEXT:    vle32.v v26, (sp)
-; LMULMAX2-NEXT:    addi a0, sp, 32
+; LMULMAX2-NEXT:    addi a0, sp, 16
+; LMULMAX2-NEXT:    vle32.v v26, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 48
 ; LMULMAX2-NEXT:    vle32.v v28, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 64
+; LMULMAX2-NEXT:    addi a0, sp, 80
 ; LMULMAX2-NEXT:    vle32.v v30, (a0)
-; LMULMAX2-NEXT:    addi a0, sp, 96
+; LMULMAX2-NEXT:    addi a0, sp, 112
 ; LMULMAX2-NEXT:    vle32.v v16, (a0)
 ; LMULMAX2-NEXT:    vadd.vv v8, v8, v26
 ; LMULMAX2-NEXT:    vadd.vv v10, v10, v28
 ; LMULMAX2-NEXT:    vadd.vv v12, v12, v30
 ; LMULMAX2-NEXT:    vadd.vv v14, v14, v16
+; LMULMAX2-NEXT:    addi sp, sp, 16
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: vector_arg_via_stack:
 ; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -16
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 112
+; LMULMAX1-NEXT:    addi a0, sp, 128
 ; LMULMAX1-NEXT:    vle32.v v25, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 96
+; LMULMAX1-NEXT:    addi a0, sp, 112
 ; LMULMAX1-NEXT:    vle32.v v26, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 80
+; LMULMAX1-NEXT:    addi a0, sp, 96
 ; LMULMAX1-NEXT:    vle32.v v27, (a0)
-; LMULMAX1-NEXT:    addi a0, sp, 64
+; LMULMAX1-NEXT:    addi a0, sp, 80
 ; LMULMAX1-NEXT:    vle32.v v28, (a0)
-; LMULMAX1-NEXT:    vle32.v v29, (sp)
 ; LMULMAX1-NEXT:    addi a0, sp, 16
-; LMULMAX1-NEXT:    vle32.v v30, (a0)
+; LMULMAX1-NEXT:    vle32.v v29, (a0)
 ; LMULMAX1-NEXT:    addi a0, sp, 32
-; LMULMAX1-NEXT:    vle32.v v31, (a0)
+; LMULMAX1-NEXT:    vle32.v v30, (a0)
 ; LMULMAX1-NEXT:    addi a0, sp, 48
+; LMULMAX1-NEXT:    vle32.v v31, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 64
 ; LMULMAX1-NEXT:    vle32.v v16, (a0)
 ; LMULMAX1-NEXT:    vadd.vv v8, v8, v29
 ; LMULMAX1-NEXT:    vadd.vv v9, v9, v30
@@ -1232,6 +1254,7 @@
 ; LMULMAX1-NEXT:    vadd.vv v13, v13, v27
 ; LMULMAX1-NEXT:    vadd.vv v14, v14, v26
 ; LMULMAX1-NEXT:    vadd.vv v15, v15, v25
+; LMULMAX1-NEXT:    addi sp, sp, 16
 ; LMULMAX1-NEXT:    ret
   %s = add <32 x i32> %x, %z
   ret <32 x i32> %s
@@ -1392,9 +1415,12 @@
 define <4 x i1> @vector_mask_arg_via_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8, <4 x i1> %9, <4 x i1> %10) {
 ; CHECK-LABEL: vector_mask_arg_via_stack:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    vsetivli zero, 4, e8,mf4,ta,mu
-; CHECK-NEXT:    addi a0, sp, 136
+; CHECK-NEXT:    addi a0, sp, 152
 ; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   ret <4 x i1> %10
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -7,8 +7,8 @@
 define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v16i8:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle8.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -45,7 +45,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV32-NEXT:    srli a5, a5, 24
 ; LMULMAX2-RV32-NEXT:    addi a5, a5, -24
-; LMULMAX2-RV32-NEXT:    sb a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sb a5, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v26
@@ -74,7 +74,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -102,7 +102,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -130,7 +130,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -158,7 +158,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -186,7 +186,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -214,7 +214,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -242,7 +242,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -270,7 +270,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -298,7 +298,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -326,7 +326,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -354,7 +354,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -382,7 +382,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -410,7 +410,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -438,7 +438,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -466,17 +466,18 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle8.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle8.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse8.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v16i8:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle8.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
@@ -537,7 +538,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a5, 56
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, -56
-; LMULMAX2-RV64-NEXT:    sb a5, 0(sp)
+; LMULMAX2-RV64-NEXT:    sb a5, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a5, v26
@@ -568,7 +569,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 31(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -598,7 +599,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -628,7 +629,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -658,7 +659,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -688,7 +689,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -718,7 +719,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -748,7 +749,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -778,7 +779,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -808,7 +809,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -838,7 +839,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -868,7 +869,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -898,7 +899,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -928,7 +929,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -958,7 +959,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -988,17 +989,18 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle8.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle8.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse8.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v16i8:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle8.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -1035,7 +1037,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV32-NEXT:    srli a5, a5, 24
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, -24
-; LMULMAX1-RV32-NEXT:    sb a5, 0(sp)
+; LMULMAX1-RV32-NEXT:    sb a5, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a5, v26
@@ -1064,7 +1066,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1092,7 +1094,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1120,7 +1122,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1148,7 +1150,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1176,7 +1178,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1204,7 +1206,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1232,7 +1234,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1260,7 +1262,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1288,7 +1290,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1316,7 +1318,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1344,7 +1346,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1372,7 +1374,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1400,7 +1402,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1428,7 +1430,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -1456,17 +1458,18 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle8.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle8.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse8.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v16i8:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle8.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
@@ -1527,7 +1530,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV64-NEXT:    srli a5, a5, 56
 ; LMULMAX1-RV64-NEXT:    addi a5, a5, -56
-; LMULMAX1-RV64-NEXT:    sb a5, 0(sp)
+; LMULMAX1-RV64-NEXT:    sb a5, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a5, v26
@@ -1558,7 +1561,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1588,7 +1591,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1618,7 +1621,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1648,7 +1651,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1678,7 +1681,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1708,7 +1711,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1738,7 +1741,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1768,7 +1771,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1798,7 +1801,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1828,7 +1831,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1858,7 +1861,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1888,7 +1891,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1918,7 +1921,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1948,7 +1951,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -1978,11 +1981,12 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle8.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle8.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse8.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i8>, <16 x i8>* %x
   %b = load <16 x i8>, <16 x i8>* %y
@@ -1995,8 +1999,8 @@
 define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v8i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v25
@@ -2035,7 +2039,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -2064,7 +2068,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2092,7 +2096,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2120,7 +2124,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2148,7 +2152,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2176,7 +2180,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2204,7 +2208,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -2232,17 +2236,18 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v8i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v25
@@ -2305,7 +2310,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
@@ -2336,7 +2341,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2366,7 +2371,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2396,7 +2401,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2426,7 +2431,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2456,7 +2461,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2486,7 +2491,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -2516,17 +2521,18 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v8i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
@@ -2565,7 +2571,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -2594,7 +2600,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2622,7 +2628,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2650,7 +2656,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2678,7 +2684,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2706,7 +2712,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2734,7 +2740,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a6
@@ -2762,17 +2768,18 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v8i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
@@ -2835,7 +2842,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
@@ -2866,7 +2873,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -2896,7 +2903,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -2926,7 +2933,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -2956,7 +2963,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -2986,7 +2993,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -3016,7 +3023,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a6
@@ -3046,11 +3053,12 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
   %b = load <8 x i16>, <8 x i16>* %y
@@ -3063,8 +3071,8 @@
 define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v4i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -3099,7 +3107,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v26
@@ -3126,7 +3134,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -3152,7 +3160,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -3178,17 +3186,18 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v4i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
@@ -3250,7 +3259,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a5, 56
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, -32
-; LMULMAX2-RV64-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a5, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a5, v26
@@ -3282,7 +3291,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -3313,7 +3322,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -3344,17 +3353,18 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v4i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -3389,7 +3399,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX1-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX1-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a5, v26
@@ -3416,7 +3426,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a5, a1, 1
@@ -3442,7 +3452,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a5, a1, 1
@@ -3468,17 +3478,18 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v4i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
@@ -3540,7 +3551,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV64-NEXT:    srli a5, a5, 56
 ; LMULMAX1-RV64-NEXT:    addi a5, a5, -32
-; LMULMAX1-RV64-NEXT:    sw a5, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a5, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a5, v26
@@ -3572,7 +3583,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a5, a1, 1
@@ -3603,7 +3614,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a5, a1, 1
@@ -3634,11 +3645,12 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <4 x i32>, <4 x i32>* %x
   %b = load <4 x i32>, <4 x i32>* %y
@@ -3651,12 +3663,12 @@
 define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v25, a6
@@ -3725,7 +3737,7 @@
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v25, a6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB3_5
 ; LMULMAX2-RV32-NEXT:  # %bb.4:
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -3779,12 +3791,13 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB3_6:
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v2i64:
@@ -3883,12 +3896,12 @@
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX1-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v25, a6
@@ -3957,7 +3970,7 @@
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v25, a6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a1, .LBB3_5
 ; LMULMAX1-RV32-NEXT:  # %bb.4:
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -4011,12 +4024,13 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB3_6:
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v2i64:
@@ -4123,13 +4137,13 @@
 define void @ctlz_v32i8(<32 x i8>* %x, <32 x i8>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v32i8:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
@@ -4169,7 +4183,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 31
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -4198,7 +4212,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 31(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 63(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 30
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4226,7 +4240,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 30(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 62(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 29
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4254,7 +4268,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 29(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 61(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 28
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4282,7 +4296,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 27
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4310,7 +4324,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 27(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 59(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 26
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4338,7 +4352,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 26(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 58(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 25
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4366,7 +4380,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 25(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 57(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 24
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4394,7 +4408,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 23
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4422,7 +4436,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 23(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 55(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 22
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4450,7 +4464,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 22(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 54(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 21
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4478,7 +4492,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 21(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 53(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 20
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4506,7 +4520,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 19
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4534,7 +4548,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 19(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 51(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 18
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4562,7 +4576,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 18(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 50(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 17
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4590,7 +4604,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 17(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 49(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 16
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4618,7 +4632,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4646,7 +4660,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 47(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4674,7 +4688,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 46(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4702,7 +4716,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 45(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4730,7 +4744,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4758,7 +4772,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 43(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4786,7 +4800,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 42(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4814,7 +4828,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 41(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4842,7 +4856,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4870,7 +4884,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 39(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4898,7 +4912,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 38(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4926,7 +4940,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 37(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4954,7 +4968,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -4982,7 +4996,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 35(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -5010,7 +5024,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 34(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    andi a1, a1, 255
@@ -5038,25 +5052,26 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX2-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 33(sp)
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a6, e8,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle8.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle8.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v32i8:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    addi a6, zero, 32
@@ -5120,7 +5135,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e8,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 31
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
@@ -5151,7 +5166,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 31(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 63(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 30
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5181,7 +5196,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 30(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 62(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 29
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5211,7 +5226,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 29(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 61(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 28
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5241,7 +5256,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 27
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5271,7 +5286,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 27(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 59(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 26
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5301,7 +5316,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 26(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 58(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 25
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5331,7 +5346,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 25(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 57(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 24
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5361,7 +5376,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 23
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5391,7 +5406,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 23(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 55(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 22
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5421,7 +5436,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 22(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 54(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 21
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5451,7 +5466,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 21(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 53(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 20
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5481,7 +5496,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 19
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5511,7 +5526,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 19(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 51(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 18
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5541,7 +5556,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 18(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 50(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 17
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5571,7 +5586,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 17(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 49(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 16
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5601,7 +5616,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5631,7 +5646,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 47(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5661,7 +5676,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 46(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5691,7 +5706,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 45(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5721,7 +5736,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5751,7 +5766,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 43(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5781,7 +5796,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 42(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5811,7 +5826,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 41(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5841,7 +5856,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5871,7 +5886,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 39(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5901,7 +5916,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 38(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5931,7 +5946,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 37(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5961,7 +5976,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -5991,7 +6006,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 35(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -6021,7 +6036,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 34(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    andi a1, a1, 255
@@ -6051,20 +6066,21 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX2-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 33(sp)
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a6, e8,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle8.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle8.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v32i8:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle8.v v26, (a6)
@@ -6103,7 +6119,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -6132,7 +6148,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 47(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6160,7 +6176,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 46(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6188,7 +6204,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 45(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6216,7 +6232,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6244,7 +6260,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 43(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6272,7 +6288,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 42(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6300,7 +6316,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 41(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6328,7 +6344,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6356,7 +6372,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 39(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6384,7 +6400,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 38(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6412,7 +6428,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 37(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6440,7 +6456,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6468,7 +6484,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 35(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6496,7 +6512,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 34(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6524,7 +6540,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 33(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -6551,7 +6567,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6579,7 +6595,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6607,7 +6623,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6635,7 +6651,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6663,7 +6679,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6691,7 +6707,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6719,7 +6735,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6747,7 +6763,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6775,7 +6791,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6803,7 +6819,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6831,7 +6847,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6859,7 +6875,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6887,7 +6903,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6915,7 +6931,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6943,7 +6959,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    andi a1, a1, 255
@@ -6971,20 +6987,21 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -24
-; LMULMAX1-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle8.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle8.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle8.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse8.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse8.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v32i8:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle8.v v26, (a6)
@@ -7047,7 +7064,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 16(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 15
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
@@ -7078,7 +7095,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 31(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 47(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7108,7 +7125,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 46(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7138,7 +7155,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 45(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7168,7 +7185,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7198,7 +7215,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 43(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7228,7 +7245,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 42(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7258,7 +7275,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 41(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7288,7 +7305,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7318,7 +7335,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 39(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7348,7 +7365,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 38(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7378,7 +7395,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 37(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7408,7 +7425,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7438,7 +7455,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 35(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7468,7 +7485,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 34(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7498,7 +7515,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 33(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
 ; LMULMAX1-RV64-NEXT:    srli a2, a1, 1
@@ -7527,7 +7544,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7557,7 +7574,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7587,7 +7604,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7617,7 +7634,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7647,7 +7664,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7677,7 +7694,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7707,7 +7724,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7737,7 +7754,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7767,7 +7784,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7797,7 +7814,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7827,7 +7844,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7857,7 +7874,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7887,7 +7904,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7917,7 +7934,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7947,7 +7964,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    andi a1, a1, 255
@@ -7977,14 +7994,15 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -56
-; LMULMAX1-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle8.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle8.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle8.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse8.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse8.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %x
   %b = load <32 x i8>, <32 x i8>* %y
@@ -7997,13 +8015,13 @@
 define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v16i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -8044,7 +8062,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -8073,7 +8091,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8101,7 +8119,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8129,7 +8147,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8157,7 +8175,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8185,7 +8203,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8213,7 +8231,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8241,7 +8259,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8269,7 +8287,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8297,7 +8315,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8325,7 +8343,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8353,7 +8371,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8381,7 +8399,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8409,7 +8427,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8437,7 +8455,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a6
@@ -8465,25 +8483,26 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v16i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -8548,7 +8567,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
@@ -8579,7 +8598,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8609,7 +8628,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8639,7 +8658,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8669,7 +8688,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8699,7 +8718,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8729,7 +8748,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8759,7 +8778,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8789,7 +8808,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8819,7 +8838,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8849,7 +8868,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8879,7 +8898,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8909,7 +8928,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8939,7 +8958,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8969,7 +8988,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a6
@@ -8999,20 +9018,21 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v16i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a6)
@@ -9053,7 +9073,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
@@ -9082,7 +9102,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9110,7 +9130,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9138,7 +9158,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9166,7 +9186,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9194,7 +9214,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9222,7 +9242,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9250,7 +9270,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
 ; LMULMAX1-RV32-NEXT:    srli a3, a2, 1
@@ -9277,7 +9297,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9305,7 +9325,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9333,7 +9353,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9361,7 +9381,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9389,7 +9409,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9417,7 +9437,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9445,7 +9465,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, -16
-; LMULMAX1-RV32-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a7
@@ -9473,20 +9493,21 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, -16
-; LMULMAX1-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse16.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v16i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a6)
@@ -9551,7 +9572,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
@@ -9582,7 +9603,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9612,7 +9633,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9642,7 +9663,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9672,7 +9693,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9702,7 +9723,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9732,7 +9753,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9762,7 +9783,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
 ; LMULMAX1-RV64-NEXT:    srli a3, a2, 1
@@ -9791,7 +9812,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9821,7 +9842,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9851,7 +9872,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9881,7 +9902,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9911,7 +9932,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9941,7 +9962,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -9971,7 +9992,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
 ; LMULMAX1-RV64-NEXT:    addi a2, a2, -48
-; LMULMAX1-RV64-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a7
@@ -10001,14 +10022,15 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -48
-; LMULMAX1-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse16.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %x
   %b = load <16 x i16>, <16 x i16>* %y
@@ -10021,13 +10043,13 @@
 define void @ctlz_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v8i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -10064,7 +10086,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v28
@@ -10091,7 +10113,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10117,7 +10139,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10143,7 +10165,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10169,7 +10191,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10195,7 +10217,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10221,7 +10243,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 1
@@ -10247,25 +10269,26 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v8i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -10329,7 +10352,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a5, 56
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, -32
-; LMULMAX2-RV64-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a5, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a5, v28
@@ -10361,7 +10384,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10392,7 +10415,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10423,7 +10446,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10454,7 +10477,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10485,7 +10508,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10516,7 +10539,7 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    srliw a5, a1, 1
@@ -10547,20 +10570,21 @@
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX2-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v8i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a6)
@@ -10597,7 +10621,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -10624,7 +10648,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -10650,7 +10674,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -10676,7 +10700,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a2
@@ -10701,7 +10725,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -10727,7 +10751,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -10753,7 +10777,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
@@ -10779,20 +10803,21 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v8i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a6)
@@ -10856,7 +10881,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
@@ -10888,7 +10913,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
@@ -10919,7 +10944,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
@@ -10950,7 +10975,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
 ; LMULMAX1-RV64-NEXT:    slli a1, a1, 32
@@ -10980,7 +11005,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
@@ -11011,7 +11036,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
@@ -11042,7 +11067,7 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    srliw a2, a1, 1
@@ -11073,14 +11098,15 @@
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, -32
-; LMULMAX1-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %x
   %b = load <8 x i32>, <8 x i32>* %y
@@ -11093,21 +11119,21 @@
 define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: ctlz_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    sw zero, 28(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 20(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 60(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 52(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 44(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 36(sp)
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v28, v26, a6
@@ -11176,7 +11202,7 @@
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v30, v28, a6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v30
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 32(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB7_5
 ; LMULMAX2-RV32-NEXT:  # %bb.4:
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -11233,7 +11259,7 @@
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v30, v28, a6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v30
-; LMULMAX2-RV32-NEXT:    sw a5, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 56(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB7_8
 ; LMULMAX2-RV32-NEXT:  # %bb.7:
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -11290,7 +11316,7 @@
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v28, v26, a6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
-; LMULMAX2-RV32-NEXT:    sw a5, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 48(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB7_11
 ; LMULMAX2-RV32-NEXT:  # %bb.10:
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -11344,26 +11370,27 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB7_12:
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: ctlz_v4i64:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
@@ -11424,7 +11451,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a5, 56
-; LMULMAX2-RV64-NEXT:    sd a5, 0(sp)
+; LMULMAX2-RV64-NEXT:    sd a5, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a5, v28
@@ -11453,7 +11480,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    srli a5, a1, 1
@@ -11481,7 +11508,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    srli a5, a1, 1
@@ -11509,26 +11536,27 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle64.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle64.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: ctlz_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 44(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 36(sp)
 ; LMULMAX1-RV32-NEXT:    addi a7, zero, 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v27, v26, a7
@@ -11597,7 +11625,7 @@
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v27, v26, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a2, .LBB7_5
 ; LMULMAX1-RV32-NEXT:  # %bb.4:
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -11651,11 +11679,11 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB7_6:
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v25, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a1, .LBB7_8
 ; LMULMAX1-RV32-NEXT:  # %bb.7:
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -11712,7 +11740,7 @@
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v25, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a2, .LBB7_11
 ; LMULMAX1-RV32-NEXT:  # %bb.10:
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -11766,15 +11794,16 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB7_12:
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse64.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: ctlz_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -7,8 +7,8 @@
 define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v16i8:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle8.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -36,7 +36,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX2-RV32-NEXT:    sb a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sb a5, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v26
@@ -56,7 +56,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -75,7 +75,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -94,7 +94,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -113,7 +113,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -132,7 +132,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -151,7 +151,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -170,7 +170,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -189,7 +189,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -208,7 +208,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -227,7 +227,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -246,7 +246,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -265,7 +265,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -284,7 +284,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -303,7 +303,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -322,17 +322,18 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle8.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle8.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse8.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v16i8:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle8.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
@@ -384,7 +385,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a5, 56
-; LMULMAX2-RV64-NEXT:    sb a5, 15(sp)
+; LMULMAX2-RV64-NEXT:    sb a5, 31(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a5, v26
 ; LMULMAX2-RV64-NEXT:    ori a5, a5, 256
@@ -403,7 +404,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -422,7 +423,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -441,7 +442,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -460,7 +461,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -479,7 +480,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -498,7 +499,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -517,7 +518,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -536,7 +537,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -555,7 +556,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -574,7 +575,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -593,7 +594,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -612,7 +613,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -631,7 +632,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -650,7 +651,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
 ; LMULMAX2-RV64-NEXT:    addi a5, a1, -1
@@ -668,17 +669,18 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle8.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle8.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse8.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v16i8:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle8.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -706,7 +708,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX1-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX1-RV32-NEXT:    sb a5, 0(sp)
+; LMULMAX1-RV32-NEXT:    sb a5, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a5, v26
@@ -726,7 +728,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -745,7 +747,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -764,7 +766,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -783,7 +785,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -802,7 +804,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -821,7 +823,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -840,7 +842,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -859,7 +861,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -878,7 +880,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -897,7 +899,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -916,7 +918,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -935,7 +937,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -954,7 +956,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -973,7 +975,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -992,17 +994,18 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle8.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle8.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse8.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v16i8:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle8.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
@@ -1054,7 +1057,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a4, a4, 257
 ; LMULMAX1-RV64-NEXT:    mul a5, a5, a4
 ; LMULMAX1-RV64-NEXT:    srli a5, a5, 56
-; LMULMAX1-RV64-NEXT:    sb a5, 15(sp)
+; LMULMAX1-RV64-NEXT:    sb a5, 31(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a5, v26
 ; LMULMAX1-RV64-NEXT:    ori a5, a5, 256
@@ -1073,7 +1076,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1092,7 +1095,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1111,7 +1114,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1130,7 +1133,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1149,7 +1152,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1168,7 +1171,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1187,7 +1190,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1206,7 +1209,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1225,7 +1228,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1244,7 +1247,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1263,7 +1266,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1282,7 +1285,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1301,7 +1304,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -1320,7 +1323,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
 ; LMULMAX1-RV64-NEXT:    addi a5, a1, -1
@@ -1338,11 +1341,12 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle8.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle8.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse8.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i8>, <16 x i8>* %x
   %b = load <16 x i8>, <16 x i8>* %y
@@ -1355,8 +1359,8 @@
 define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v8i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a2, v25
@@ -1385,7 +1389,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -1405,7 +1409,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1424,7 +1428,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1443,7 +1447,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1462,7 +1466,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1481,7 +1485,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1500,7 +1504,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -1519,17 +1523,18 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v8i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
@@ -1582,7 +1587,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1601,7 +1606,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1620,7 +1625,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1639,7 +1644,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1658,7 +1663,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1677,7 +1682,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -1696,7 +1701,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
 ; LMULMAX2-RV64-NEXT:    addi a2, a1, -1
@@ -1714,17 +1719,18 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v8i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
@@ -1753,7 +1759,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -1773,7 +1779,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1792,7 +1798,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1811,7 +1817,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1830,7 +1836,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1849,7 +1855,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1868,7 +1874,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    or a1, a1, a6
@@ -1887,17 +1893,18 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v8i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
@@ -1950,7 +1957,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -1969,7 +1976,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -1988,7 +1995,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -2007,7 +2014,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -2026,7 +2033,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -2045,7 +2052,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
@@ -2064,7 +2071,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    or a1, a1, a6
 ; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
@@ -2082,11 +2089,12 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i16>, <8 x i16>* %x
   %b = load <8 x i16>, <8 x i16>* %y
@@ -2099,8 +2107,8 @@
 define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v4i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
@@ -2127,7 +2135,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
@@ -2146,7 +2154,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    addi a3, a1, -1
@@ -2164,7 +2172,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    addi a3, a1, -1
@@ -2182,17 +2190,18 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v4i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX2-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
@@ -2246,7 +2255,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a1, a1, 257
 ; LMULMAX2-RV64-NEXT:    mul a4, a4, a1
 ; LMULMAX2-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX2-RV64-NEXT:    sw a4, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a4, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a4, v26
 ; LMULMAX2-RV64-NEXT:    or a4, a4, a6
@@ -2265,7 +2274,7 @@
 ; LMULMAX2-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX2-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX2-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX2-RV64-NEXT:    sw a2, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a2, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX2-RV64-NEXT:    or a2, a2, a6
@@ -2284,7 +2293,7 @@
 ; LMULMAX2-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX2-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX2-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX2-RV64-NEXT:    sw a2, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a2, 20(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX2-RV64-NEXT:    or a2, a2, a6
 ; LMULMAX2-RV64-NEXT:    addi a4, a2, -1
@@ -2302,17 +2311,18 @@
 ; LMULMAX2-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX2-RV64-NEXT:    mul a1, a2, a1
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v4i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
@@ -2339,7 +2349,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -2358,7 +2368,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    addi a3, a1, -1
@@ -2376,7 +2386,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    addi a3, a1, -1
@@ -2394,17 +2404,18 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v4i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vle32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
@@ -2458,7 +2469,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 257
 ; LMULMAX1-RV64-NEXT:    mul a4, a4, a1
 ; LMULMAX1-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX1-RV64-NEXT:    sw a4, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a4, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a4, v26
 ; LMULMAX1-RV64-NEXT:    or a4, a4, a6
@@ -2477,7 +2488,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a6
@@ -2496,7 +2507,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a6
 ; LMULMAX1-RV64-NEXT:    addi a4, a2, -1
@@ -2514,11 +2525,12 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <4 x i32>, <4 x i32>* %x
   %b = load <4 x i32>, <4 x i32>* %y
@@ -2531,12 +2543,12 @@
 define void @cttz_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v2i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a4, a1, 1365
@@ -2586,7 +2598,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB3_3:
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v25
@@ -2628,12 +2640,13 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB3_6:
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX2-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v2i64:
@@ -2712,12 +2725,12 @@
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v2i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -16
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX1-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a4, a1, 1365
@@ -2767,7 +2780,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a5, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB3_3:
-; LMULMAX1-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a5, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a5, v25
@@ -2809,12 +2822,13 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB3_6:
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 16
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v2i64:
@@ -2901,13 +2915,13 @@
 define void @cttz_v32i8(<32 x i8>* %x, <32 x i8>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v32i8:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
@@ -2938,7 +2952,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e8,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 31
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -2958,7 +2972,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 31(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 63(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 30
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -2977,7 +2991,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 30(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 62(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 29
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -2996,7 +3010,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 29(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 61(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 28
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3015,7 +3029,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 27
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3034,7 +3048,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 27(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 59(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 26
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3053,7 +3067,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 26(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 58(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 25
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3072,7 +3086,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 25(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 57(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 24
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3091,7 +3105,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 23
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3110,7 +3124,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 23(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 55(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 22
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3129,7 +3143,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 22(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 54(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 21
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3148,7 +3162,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 21(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 53(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 20
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3167,7 +3181,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 19
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3186,7 +3200,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 19(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 51(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 18
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3205,7 +3219,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 18(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 50(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 17
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3224,7 +3238,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 17(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 49(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 16
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3243,7 +3257,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3262,7 +3276,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 47(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3281,7 +3295,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 46(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3300,7 +3314,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 45(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3319,7 +3333,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3338,7 +3352,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 43(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3357,7 +3371,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 42(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3376,7 +3390,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 41(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3395,7 +3409,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3414,7 +3428,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 39(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3433,7 +3447,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 38(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3452,7 +3466,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 37(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3471,7 +3485,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3490,7 +3504,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 35(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3509,7 +3523,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 34(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    ori a1, a1, 256
@@ -3528,25 +3542,26 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV32-NEXT:    sb a1, 33(sp)
 ; LMULMAX2-RV32-NEXT:    vsetvli zero, a6, e8,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle8.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle8.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v32i8:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    addi a6, zero, 32
@@ -3601,7 +3616,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 31(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 63(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 30
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3620,7 +3635,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 30(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 62(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 29
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3639,7 +3654,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 29(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 61(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 28
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3658,7 +3673,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 27
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3677,7 +3692,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 27(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 59(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 26
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3696,7 +3711,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 26(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 58(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 25
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3715,7 +3730,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 25(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 57(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 24
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3734,7 +3749,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 23
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3753,7 +3768,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 23(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 55(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 22
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3772,7 +3787,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 22(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 54(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 21
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3791,7 +3806,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 21(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 53(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 20
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3810,7 +3825,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 19
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3829,7 +3844,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 19(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 51(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 18
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3848,7 +3863,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 18(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 50(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 17
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3867,7 +3882,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 17(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 49(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 16
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3886,7 +3901,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3905,7 +3920,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 47(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3924,7 +3939,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 46(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3943,7 +3958,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 45(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3962,7 +3977,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -3981,7 +3996,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 43(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4000,7 +4015,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 42(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4019,7 +4034,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 41(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4038,7 +4053,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4057,7 +4072,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 39(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4076,7 +4091,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 38(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4095,7 +4110,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 37(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4114,7 +4129,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4133,7 +4148,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 35(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4152,7 +4167,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 34(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
@@ -4171,7 +4186,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 33(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    ori a1, a1, 256
 ; LMULMAX2-RV64-NEXT:    addi a2, a1, -1
@@ -4189,20 +4204,21 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sb a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetvli zero, a6, e8,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle8.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle8.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse8.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v32i8:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle8.v v26, (a6)
@@ -4232,7 +4248,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -4252,7 +4268,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 47(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4271,7 +4287,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 46(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4290,7 +4306,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 45(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4309,7 +4325,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4328,7 +4344,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 43(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4347,7 +4363,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 42(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4366,7 +4382,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 41(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4385,7 +4401,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4404,7 +4420,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 39(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4423,7 +4439,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 38(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4442,7 +4458,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 37(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4461,7 +4477,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4480,7 +4496,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 35(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4499,7 +4515,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 34(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4518,7 +4534,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 33(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -4536,7 +4552,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4555,7 +4571,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4574,7 +4590,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4593,7 +4609,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4612,7 +4628,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4631,7 +4647,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4650,7 +4666,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4669,7 +4685,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4688,7 +4704,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4707,7 +4723,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4726,7 +4742,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4745,7 +4761,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4764,7 +4780,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4783,7 +4799,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4802,7 +4818,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    ori a1, a1, 256
@@ -4821,20 +4837,21 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV32-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle8.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle8.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle8.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse8.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse8.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v32i8:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle8.v v26, (a6)
@@ -4886,7 +4903,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 16(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e8,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 15
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
@@ -4906,7 +4923,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 31(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 47(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -4925,7 +4942,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 46(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -4944,7 +4961,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 45(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -4963,7 +4980,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -4982,7 +4999,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 43(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5001,7 +5018,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 42(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5020,7 +5037,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 41(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5039,7 +5056,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5058,7 +5075,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 39(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5077,7 +5094,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 38(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5096,7 +5113,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 37(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5115,7 +5132,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5134,7 +5151,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 35(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5153,7 +5170,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 34(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5172,7 +5189,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 33(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 15
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5191,7 +5208,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 15(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 31(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 14
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5210,7 +5227,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 14(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 13
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5229,7 +5246,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 13(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 29(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 12
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5248,7 +5265,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 12(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 11
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5267,7 +5284,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 11(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 27(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 10
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5286,7 +5303,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 10(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 9
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5305,7 +5322,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 9(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 25(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 8
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5324,7 +5341,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 8(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5343,7 +5360,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 7(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 23(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5362,7 +5379,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 6(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5381,7 +5398,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 5(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 21(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5400,7 +5417,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 4(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5419,7 +5436,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 3(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 19(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5438,7 +5455,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 2(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
@@ -5457,7 +5474,7 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 1(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 17(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV64-NEXT:    ori a1, a1, 256
 ; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
@@ -5475,14 +5492,15 @@
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sb a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sb a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 16, e8,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle8.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle8.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle8.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse8.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse8.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <32 x i8>, <32 x i8>* %x
   %b = load <32 x i8>, <32 x i8>* %y
@@ -5495,13 +5513,13 @@
 define void @cttz_v16i16(<16 x i16>* %x, <16 x i16>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v16i16:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -5532,7 +5550,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 15
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
@@ -5552,7 +5570,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5571,7 +5589,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5590,7 +5608,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5609,7 +5627,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5628,7 +5646,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5647,7 +5665,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5666,7 +5684,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5685,7 +5703,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5704,7 +5722,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5723,7 +5741,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5742,7 +5760,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5761,7 +5779,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5780,7 +5798,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5799,7 +5817,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    or a1, a1, a6
@@ -5818,25 +5836,26 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV32-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v16i16:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
@@ -5891,7 +5910,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 30(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 62(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 14
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -5910,7 +5929,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 13
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -5929,7 +5948,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 26(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 58(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 12
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -5948,7 +5967,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 11
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -5967,7 +5986,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 22(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 54(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 10
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -5986,7 +6005,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 9
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6005,7 +6024,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 18(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 50(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 8
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6024,7 +6043,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6043,7 +6062,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 14(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 46(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6062,7 +6081,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6081,7 +6100,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 10(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 42(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6100,7 +6119,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6119,7 +6138,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 6(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 38(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6138,7 +6157,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -6157,7 +6176,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 2(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 34(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
 ; LMULMAX2-RV64-NEXT:    addi a2, a1, -1
@@ -6175,20 +6194,21 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sh a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle16.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle16.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse16.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v16i16:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a6)
@@ -6219,7 +6239,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
@@ -6239,7 +6259,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6258,7 +6278,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6277,7 +6297,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6296,7 +6316,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6315,7 +6335,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6334,7 +6354,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6353,7 +6373,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
 ; LMULMAX1-RV32-NEXT:    addi a3, a2, -1
@@ -6371,7 +6391,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 0(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6390,7 +6410,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6409,7 +6429,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6428,7 +6448,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6447,7 +6467,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6466,7 +6486,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6485,7 +6505,7 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a2, 24
-; LMULMAX1-RV32-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV32-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV32-NEXT:    or a2, a2, a7
@@ -6504,20 +6524,21 @@
 ; LMULMAX1-RV32-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV32-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sh a1, 2(sp)
+; LMULMAX1-RV32-NEXT:    sh a1, 18(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle16.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle16.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse16.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v16i16:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a6)
@@ -6570,7 +6591,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 257
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 16(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
@@ -6590,7 +6611,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 46(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6609,7 +6630,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6628,7 +6649,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 42(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6647,7 +6668,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6666,7 +6687,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 38(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6685,7 +6706,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6704,7 +6725,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 18(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 34(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 7
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6723,7 +6744,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 14(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 30(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 6
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6742,7 +6763,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 12(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 5
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6761,7 +6782,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 10(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 26(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 4
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6780,7 +6801,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 8(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6799,7 +6820,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 6(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 22(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6818,7 +6839,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 4(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -6837,7 +6858,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sh a2, 2(sp)
+; LMULMAX1-RV64-NEXT:    sh a2, 18(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
 ; LMULMAX1-RV64-NEXT:    addi a3, a2, -1
@@ -6855,14 +6876,15 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sh a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sh a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle16.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle16.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse16.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <16 x i16>, <16 x i16>* %x
   %b = load <16 x i16>, <16 x i16>* %y
@@ -6875,13 +6897,13 @@
 define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v8i32:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -6910,7 +6932,7 @@
 ; LMULMAX2-RV32-NEXT:    addi a4, a4, 257
 ; LMULMAX2-RV32-NEXT:    mul a5, a5, a4
 ; LMULMAX2-RV32-NEXT:    srli a5, a5, 24
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 7
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v28
@@ -6929,7 +6951,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 60(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -6947,7 +6969,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 56(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -6965,7 +6987,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 52(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -6983,7 +7005,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 48(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -7001,7 +7023,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -7019,7 +7041,7 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV32-NEXT:    addi a5, a1, -1
@@ -7037,25 +7059,26 @@
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v8i32:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
@@ -7111,7 +7134,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 28(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 60(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 6
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7130,7 +7153,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 24(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 5
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7149,7 +7172,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 20(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 52(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 4
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7168,7 +7191,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7187,7 +7210,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 12(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 44(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7206,7 +7229,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
@@ -7225,7 +7248,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 4(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 36(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    or a1, a1, a6
 ; LMULMAX2-RV64-NEXT:    addi a2, a1, -1
@@ -7243,20 +7266,21 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sw a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse32.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v8i32:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a6)
@@ -7285,7 +7309,7 @@
 ; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
@@ -7304,7 +7328,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v27
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -7322,7 +7346,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -7340,7 +7364,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 36(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
 ; LMULMAX1-RV32-NEXT:    not a1, a1
@@ -7357,7 +7381,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -7375,7 +7399,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -7393,7 +7417,7 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
@@ -7411,20 +7435,21 @@
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v8i32:
 ; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV64-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    addi a6, a0, 16
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a6)
@@ -7478,7 +7503,7 @@
 ; LMULMAX1-RV64-NEXT:    addi a1, a1, 257
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 16(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 32(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
@@ -7498,7 +7523,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 28(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 44(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v27, v26, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v27
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -7517,7 +7542,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 24(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 40(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -7536,7 +7561,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 20(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 36(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 3
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -7555,7 +7580,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 12(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -7574,7 +7599,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 8(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
@@ -7593,7 +7618,7 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
-; LMULMAX1-RV64-NEXT:    sw a2, 4(sp)
+; LMULMAX1-RV64-NEXT:    sw a2, 20(sp)
 ; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
 ; LMULMAX1-RV64-NEXT:    or a2, a2, a7
 ; LMULMAX1-RV64-NEXT:    addi a3, a2, -1
@@ -7611,14 +7636,15 @@
 ; LMULMAX1-RV64-NEXT:    and a2, a2, a5
 ; LMULMAX1-RV64-NEXT:    mul a1, a2, a1
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX1-RV64-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV64-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV64-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV64-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV64-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV64-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV64-NEXT:    vse32.v v25, (a0)
 ; LMULMAX1-RV64-NEXT:    vse32.v v26, (a6)
-; LMULMAX1-RV64-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV64-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV64-NEXT:    ret
   %a = load <8 x i32>, <8 x i32>* %x
   %b = load <8 x i32>, <8 x i32>* %y
@@ -7631,21 +7657,21 @@
 define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-LABEL: cttz_v4i64:
 ; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; LMULMAX2-RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV32-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; LMULMAX2-RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; LMULMAX2-RV32-NEXT:    .cfi_offset ra, -4
 ; LMULMAX2-RV32-NEXT:    .cfi_offset s0, -8
-; LMULMAX2-RV32-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV32-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV32-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV32-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vle64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    sw zero, 28(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 20(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 12(sp)
-; LMULMAX2-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 60(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 52(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 44(sp)
+; LMULMAX2-RV32-NEXT:    sw zero, 36(sp)
 ; LMULMAX2-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX2-RV32-NEXT:    lui a1, 349525
 ; LMULMAX2-RV32-NEXT:    addi a4, a1, 1365
@@ -7695,7 +7721,7 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a5, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB7_3:
-; LMULMAX2-RV32-NEXT:    sw a5, 0(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 32(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 3
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a5, v28
@@ -7739,7 +7765,7 @@
 ; LMULMAX2-RV32-NEXT:  .LBB7_6:
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v28
-; LMULMAX2-RV32-NEXT:    sw a5, 24(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 56(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB7_8
 ; LMULMAX2-RV32-NEXT:  # %bb.7:
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v28, v28, a6
@@ -7780,7 +7806,7 @@
 ; LMULMAX2-RV32-NEXT:  .LBB7_9:
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT:    sw a5, 16(sp)
+; LMULMAX2-RV32-NEXT:    sw a5, 48(sp)
 ; LMULMAX2-RV32-NEXT:    bnez a1, .LBB7_11
 ; LMULMAX2-RV32-NEXT:  # %bb.10:
 ; LMULMAX2-RV32-NEXT:    vsrl.vx v26, v26, a6
@@ -7819,26 +7845,27 @@
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:  .LBB7_12:
-; LMULMAX2-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX2-RV32-NEXT:    sw a1, 40(sp)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; LMULMAX2-RV32-NEXT:    vle32.v v26, (sp)
+; LMULMAX2-RV32-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
 ; LMULMAX2-RV32-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV32-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; LMULMAX2-RV32-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV32-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; LMULMAX2-RV32-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV32-NEXT:    ret
 ;
 ; LMULMAX2-RV64-LABEL: cttz_v4i64:
 ; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    addi sp, sp, -64
-; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 64
-; LMULMAX2-RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; LMULMAX2-RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    addi sp, sp, -96
+; LMULMAX2-RV64-NEXT:    .cfi_def_cfa_offset 96
+; LMULMAX2-RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; LMULMAX2-RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; LMULMAX2-RV64-NEXT:    .cfi_offset ra, -8
 ; LMULMAX2-RV64-NEXT:    .cfi_offset s0, -16
-; LMULMAX2-RV64-NEXT:    addi s0, sp, 64
+; LMULMAX2-RV64-NEXT:    addi s0, sp, 96
 ; LMULMAX2-RV64-NEXT:    .cfi_def_cfa s0, 0
 ; LMULMAX2-RV64-NEXT:    andi sp, sp, -32
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
@@ -7891,7 +7918,7 @@
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    mul a3, a3, a5
 ; LMULMAX2-RV64-NEXT:    srli a3, a3, 56
-; LMULMAX2-RV64-NEXT:    sd a3, 24(sp)
+; LMULMAX2-RV64-NEXT:    sd a3, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
 ; LMULMAX2-RV64-NEXT:    addi a1, a3, -1
@@ -7909,7 +7936,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 16(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
 ; LMULMAX2-RV64-NEXT:    addi a3, a1, -1
@@ -7927,7 +7954,7 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 8(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
 ; LMULMAX2-RV64-NEXT:    addi a3, a1, -1
 ; LMULMAX2-RV64-NEXT:    not a1, a1
@@ -7944,26 +7971,27 @@
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
-; LMULMAX2-RV64-NEXT:    sd a1, 0(sp)
+; LMULMAX2-RV64-NEXT:    sd a1, 32(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; LMULMAX2-RV64-NEXT:    vle64.v v26, (sp)
+; LMULMAX2-RV64-NEXT:    addi a1, sp, 32
+; LMULMAX2-RV64-NEXT:    vle64.v v26, (a1)
 ; LMULMAX2-RV64-NEXT:    vse64.v v26, (a0)
-; LMULMAX2-RV64-NEXT:    addi sp, s0, -64
-; LMULMAX2-RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; LMULMAX2-RV64-NEXT:    addi sp, sp, 64
+; LMULMAX2-RV64-NEXT:    addi sp, s0, -96
+; LMULMAX2-RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; LMULMAX2-RV64-NEXT:    addi sp, sp, 96
 ; LMULMAX2-RV64-NEXT:    ret
 ;
 ; LMULMAX1-RV32-LABEL: cttz_v4i64:
 ; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    addi sp, sp, -32
-; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, -48
+; LMULMAX1-RV32-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    addi a7, a0, 16
 ; LMULMAX1-RV32-NEXT:    vle64.v v26, (a7)
-; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 44(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 36(sp)
 ; LMULMAX1-RV32-NEXT:    addi a6, zero, 32
 ; LMULMAX1-RV32-NEXT:    lui a1, 349525
 ; LMULMAX1-RV32-NEXT:    addi a5, a1, 1365
@@ -8013,7 +8041,7 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB7_3:
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v26, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
@@ -8055,10 +8083,10 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB7_6:
-; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
-; LMULMAX1-RV32-NEXT:    sw zero, 12(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 40(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT:    sw zero, 4(sp)
+; LMULMAX1-RV32-NEXT:    sw zero, 20(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a1, .LBB7_8
 ; LMULMAX1-RV32-NEXT:  # %bb.7:
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v25, a6
@@ -8099,7 +8127,7 @@
 ; LMULMAX1-RV32-NEXT:  .LBB7_9:
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a2, v25
-; LMULMAX1-RV32-NEXT:    sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV32-NEXT:    bnez a2, .LBB7_11
 ; LMULMAX1-RV32-NEXT:  # %bb.10:
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v25, v25, a6
@@ -8138,15 +8166,16 @@
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:  .LBB7_12:
-; LMULMAX1-RV32-NEXT:    sw a1, 8(sp)
+; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-RV32-NEXT:    vle32.v v25, (sp)
 ; LMULMAX1-RV32-NEXT:    addi a1, sp, 16
+; LMULMAX1-RV32-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-RV32-NEXT:    addi a1, sp, 32
 ; LMULMAX1-RV32-NEXT:    vle32.v v26, (a1)
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-RV32-NEXT:    vse64.v v25, (a0)
 ; LMULMAX1-RV32-NEXT:    vse64.v v26, (a7)
-; LMULMAX1-RV32-NEXT:    addi sp, sp, 32
+; LMULMAX1-RV32-NEXT:    addi sp, sp, 48
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: cttz_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir
@@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+# RUN: llc -mtriple riscv64 -start-before=prologepilog -o - \
+# RUN:     -verify-machineinstrs %s | FileCheck %s
+--- |
+  target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
+  target triple = "riscv64"
+
+  define weak_odr dso_local void @fixedlen_vector_spillslot(i8* %ay) nounwind {
+  ; CHECK-LABEL: fixedlen_vector_spillslot:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT:    addi sp, sp, -48
+  ; CHECK-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+  ; CHECK-NEXT:    sd a0, 32(sp)
+  ; CHECK-NEXT:    sd a0, 16(sp)
+  ; CHECK-NEXT:    vsetivli a5, 1, e16,m1,ta,mu
+  ; CHECK-NEXT:    sd a1, 8(sp)
+  ; CHECK-NEXT:    addi a1, sp, 24
+  ; CHECK-NEXT:    vs1r.v v25, (a1) # Unknown-size Folded Spill
+  ; CHECK-NEXT:    ld a1, 8(sp)
+  ; CHECK-NEXT:    call fixedlen_vector_spillslot@plt
+  ; CHECK-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+  ; CHECK-NEXT:    addi sp, sp, 48
+  ; CHECK-NEXT:    ret
+  entry:
+    ret void
+  }
+
+...
+---
+name:            fixedlen_vector_spillslot
+alignment:       2
+tracksRegLiveness: false
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: 0, size: 2, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.0.entry:
+    liveins: $x1, $x5, $x6, $x7, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31, $v25
+
+    SD $x10, %stack.0, 0
+    SD $x10, %stack.2, 0
+    dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype
+    PseudoVSPILL_M1 killed renamable $v25, %stack.1 :: (store unknown-size into %stack.1, align 8)
+    ; This is here just to make all the eligible registers live at this point.
+    ; This way, when we replace the frame index %stack.1 with its actual address,
+    ; we have to allocate a virtual register to compute it.
+    ; A later run of the register scavenger won't find an available register
+    ; either, so it will have to spill one to the emergency spill slot.
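+    ; In the CHECK lines above, the scavenger spills a1 to the emergency slot
+    ; at 8(sp), reuses it to address the vs1r.v at sp+24, then reloads it.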
+    PseudoCALL target-flags(riscv-plt) @fixedlen_vector_spillslot, csr_ilp32_lp64, implicit-def $x1, implicit-def $x2, implicit $x1, implicit $x5, implicit $x6, implicit $x7, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x28, implicit $x29, implicit $x30, implicit $x31
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -151,10 +151,10 @@
 define i1 @extractelt_v256i1(<256 x i8>* %x, i64 %idx) nounwind {
 ; RV32-LABEL: extractelt_v256i1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -384
-; RV32-NEXT:    sw ra, 380(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 376(sp) # 4-byte Folded Spill
-; RV32-NEXT:    addi s0, sp, 384
+; RV32-NEXT:    addi sp, sp, -512
+; RV32-NEXT:    sw ra, 508(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 504(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 512
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 255
 ; RV32-NEXT:    addi a2, a0, 128
@@ -162,30 +162,31 @@
 ; RV32-NEXT:    vsetvli zero, a3, e8,m8,ta,mu
 ; RV32-NEXT:    vle8.v v8, (a0)
 ; RV32-NEXT:    vle8.v v16, (a2)
-; RV32-NEXT:    mv a0, sp
+; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    vmseq.vi v25, v8, 0
 ; RV32-NEXT:    vmseq.vi v0, v16, 0
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    vmerge.vim v16, v8, 1, v0
-; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    addi a1, sp, 256
 ; RV32-NEXT:    vse8.v v16, (a1)
 ; RV32-NEXT:    vmv1r.v v0, v25
 ; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV32-NEXT:    vse8.v v8, (sp)
+; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    vse8.v v8, (a1)
 ; RV32-NEXT:    lb a0, 0(a0)
-; RV32-NEXT:    addi sp, s0, -384
-; RV32-NEXT:    lw s0, 376(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 384
+; RV32-NEXT:    addi sp, s0, -512
+; RV32-NEXT:    lw s0, 504(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 508(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 512
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: extractelt_v256i1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -384
-; RV64-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
-; RV64-NEXT:    addi s0, sp, 384
+; RV64-NEXT:    addi sp, sp, -512
+; RV64-NEXT:    sd ra, 504(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 496(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 512
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 255
 ; RV64-NEXT:    addi a2, a0, 128
@@ -193,22 +194,23 @@
 ; RV64-NEXT:    vsetvli zero, a3, e8,m8,ta,mu
 ; RV64-NEXT:    vle8.v v8, (a0)
 ; RV64-NEXT:    vle8.v v16, (a2)
-; RV64-NEXT:    mv a0, sp
+; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vmseq.vi v25, v8, 0
 ; RV64-NEXT:    vmseq.vi v0, v16, 0
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    vmerge.vim v16, v8, 1, v0
-; RV64-NEXT:    addi a1, sp, 128
+; RV64-NEXT:    addi a1, sp, 256
 ; RV64-NEXT:    vse8.v v16, (a1)
 ; RV64-NEXT:    vmv1r.v v0, v25
 ; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV64-NEXT:    vse8.v v8, (sp)
+; RV64-NEXT:    addi a1, sp, 128
+; RV64-NEXT:    vse8.v v8, (a1)
 ; RV64-NEXT:    lb a0, 0(a0)
-; RV64-NEXT:    addi sp, s0, -384
-; RV64-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 384
+; RV64-NEXT:    addi sp, s0, -512
+; RV64-NEXT:    ld s0, 496(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 504(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 512
 ; RV64-NEXT:    ret
   %a = load <256 x i8>, <256 x i8>* %x
   %b = icmp eq <256 x i8> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -35,23 +35,24 @@
 define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
 ; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -16
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; LMULMAX1-NEXT:    vfmv.f.s ft0, v10
-; LMULMAX1-NEXT:    fsw ft0, 8(sp)
+; LMULMAX1-NEXT:    fsw ft0, 24(sp)
 ; LMULMAX1-NEXT:    vfmv.f.s ft0, v8
-; LMULMAX1-NEXT:    fsw ft0, 0(sp)
+; LMULMAX1-NEXT:    fsw ft0, 16(sp)
 ; LMULMAX1-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; LMULMAX1-NEXT:    vslidedown.vi v26, v10, 7
 ; LMULMAX1-NEXT:    vfmv.f.s ft0, v26
-; LMULMAX1-NEXT:    fsw ft0, 12(sp)
+; LMULMAX1-NEXT:    fsw ft0, 28(sp)
 ; LMULMAX1-NEXT:    vslidedown.vi v26, v8, 7
 ; LMULMAX1-NEXT:    vfmv.f.s ft0, v26
-; LMULMAX1-NEXT:    fsw ft0, 4(sp)
+; LMULMAX1-NEXT:    fsw ft0, 20(sp)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; LMULMAX1-NEXT:    vle32.v v8, (sp)
-; LMULMAX1-NEXT:    addi sp, sp, 16
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
+; LMULMAX1-NEXT:    addi sp, sp, 32
 ; LMULMAX1-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
@@ -163,22 +163,24 @@
 ;
 ; LMULMAX1-LABEL: fpround_v8f32_v8f16:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -16
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
 ; LMULMAX1-NEXT:    addi a2, a0, 16
 ; LMULMAX1-NEXT:    vle32.v v25, (a2)
 ; LMULMAX1-NEXT:    vle32.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v25
-; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    addi a0, sp, 24
 ; LMULMAX1-NEXT:    vse16.v v27, (a0)
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
-; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-NEXT:    vle16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-NEXT:    vse16.v v25, (a1)
-; LMULMAX1-NEXT:    addi sp, sp, 16
+; LMULMAX1-NEXT:    addi sp, sp, 32
 ; LMULMAX1-NEXT:    ret
   %a = load <8 x float>, <8 x float>* %x
   %d = fptrunc <8 x float> %a to <8 x half>
@@ -200,8 +202,8 @@
 ;
 ; LMULMAX1-LABEL: fpround_v8f64_v8f16:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -32
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    addi sp, sp, -48
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-NEXT:    addi a2, a0, 32
@@ -214,39 +216,41 @@
 ; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v29, v27
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v29
-; LMULMAX1-NEXT:    addi a0, sp, 12
+; LMULMAX1-NEXT:    addi a0, sp, 28
 ; LMULMAX1-NEXT:    vse16.v v27, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v27, v28
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v28, v27
-; LMULMAX1-NEXT:    addi a0, sp, 4
+; LMULMAX1-NEXT:    addi a0, sp, 20
 ; LMULMAX1-NEXT:    vse16.v v28, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v27, v26
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v26, v27
-; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    addi a0, sp, 24
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 8
-; LMULMAX1-NEXT:    vle16.v v26, (a0)
 ; LMULMAX1-NEXT:    addi a0, sp, 24
+; LMULMAX1-NEXT:    vle16.v v26, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 40
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.rod.f.f.w v26, v25
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
-; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-NEXT:    vse16.v v25, (a1)
-; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    addi sp, sp, 48
 ; LMULMAX1-NEXT:    ret
   %a = load <8 x double>, <8 x double>* %x
   %d = fptrunc <8 x double> %a to <8 x half>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -256,8 +256,8 @@
 ;
 ; LMULMAX1-LABEL: si2fp_v8i64_v8f16:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -32
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    addi sp, sp, -48
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-NEXT:    addi a2, a0, 32
@@ -270,39 +270,41 @@
 ; LMULMAX1-NEXT:    vfncvt.f.x.w v29, v27
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v29
-; LMULMAX1-NEXT:    addi a0, sp, 12
+; LMULMAX1-NEXT:    addi a0, sp, 28
 ; LMULMAX1-NEXT:    vse16.v v27, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.x.w v27, v28
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v28, v27
-; LMULMAX1-NEXT:    addi a0, sp, 4
+; LMULMAX1-NEXT:    addi a0, sp, 20
 ; LMULMAX1-NEXT:    vse16.v v28, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.x.w v27, v26
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v26, v27
-; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    addi a0, sp, 24
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 8
-; LMULMAX1-NEXT:    vle16.v v26, (a0)
 ; LMULMAX1-NEXT:    addi a0, sp, 24
+; LMULMAX1-NEXT:    vle16.v v26, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 40
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.x.w v26, v25
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
-; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-NEXT:    vse16.v v25, (a1)
-; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    addi sp, sp, 48
 ; LMULMAX1-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %x
   %d = sitofp <8 x i64> %a to <8 x half>
@@ -324,8 +326,8 @@
 ;
 ; LMULMAX1-LABEL: ui2fp_v8i64_v8f16:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi sp, sp, -32
-; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    addi sp, sp, -48
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 48
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e64,m1,ta,mu
 ; LMULMAX1-NEXT:    vle64.v v25, (a0)
 ; LMULMAX1-NEXT:    addi a2, a0, 32
@@ -338,39 +340,41 @@
 ; LMULMAX1-NEXT:    vfncvt.f.xu.w v29, v27
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v27, v29
-; LMULMAX1-NEXT:    addi a0, sp, 12
+; LMULMAX1-NEXT:    addi a0, sp, 28
 ; LMULMAX1-NEXT:    vse16.v v27, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.xu.w v27, v28
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v28, v27
-; LMULMAX1-NEXT:    addi a0, sp, 4
+; LMULMAX1-NEXT:    addi a0, sp, 20
 ; LMULMAX1-NEXT:    vse16.v v28, (a0)
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.xu.w v27, v26
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v26, v27
-; LMULMAX1-NEXT:    addi a0, sp, 8
+; LMULMAX1-NEXT:    addi a0, sp, 24
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 8
-; LMULMAX1-NEXT:    vle16.v v26, (a0)
 ; LMULMAX1-NEXT:    addi a0, sp, 24
+; LMULMAX1-NEXT:    vle16.v v26, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 40
 ; LMULMAX1-NEXT:    vse16.v v26, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e32,mf2,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.xu.w v26, v25
 ; LMULMAX1-NEXT:    vsetvli zero, zero, e16,mf4,ta,mu
 ; LMULMAX1-NEXT:    vfncvt.f.f.w v25, v26
-; LMULMAX1-NEXT:    vse16.v v25, (sp)
+; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e16,mf2,ta,mu
-; LMULMAX1-NEXT:    vle16.v v25, (sp)
 ; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    vle16.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vse16.v v25, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; LMULMAX1-NEXT:    addi a0, sp, 16
+; LMULMAX1-NEXT:    addi a0, sp, 32
 ; LMULMAX1-NEXT:    vle16.v v25, (a0)
 ; LMULMAX1-NEXT:    vse16.v v25, (a1)
-; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    addi sp, sp, 48
 ; LMULMAX1-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %x
   %d = uitofp <8 x i64> %a to <8 x half>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll
@@ -171,13 +171,13 @@
 define <8 x half> @select_v8f16(i1 zeroext %c, <8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: select_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    bnez a0, .LBB4_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v9
-; CHECK-NEXT:    fsh ft0, 0(sp)
+; CHECK-NEXT:    fsh ft0, 16(sp)
 ; CHECK-NEXT:    beqz a0, .LBB4_4
 ; CHECK-NEXT:  .LBB4_2:
 ; CHECK-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
@@ -186,14 +186,14 @@
 ; CHECK-NEXT:  .LBB4_3:
 ; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v8
-; CHECK-NEXT:    fsh ft0, 0(sp)
+; CHECK-NEXT:    fsh ft0, 16(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_2
 ; CHECK-NEXT:  .LBB4_4:
 ; CHECK-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 7
 ; CHECK-NEXT:  .LBB4_5:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 14(sp)
+; CHECK-NEXT:    fsh ft0, 30(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_7
 ; CHECK-NEXT:  # %bb.6:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 6
@@ -202,7 +202,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 6
 ; CHECK-NEXT:  .LBB4_8:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 12(sp)
+; CHECK-NEXT:    fsh ft0, 28(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_10
 ; CHECK-NEXT:  # %bb.9:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 5
@@ -211,7 +211,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 5
 ; CHECK-NEXT:  .LBB4_11:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 10(sp)
+; CHECK-NEXT:    fsh ft0, 26(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_13
 ; CHECK-NEXT:  # %bb.12:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 4
@@ -220,7 +220,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 4
 ; CHECK-NEXT:  .LBB4_14:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 8(sp)
+; CHECK-NEXT:    fsh ft0, 24(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_16
 ; CHECK-NEXT:  # %bb.15:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 3
@@ -229,7 +229,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 3
 ; CHECK-NEXT:  .LBB4_17:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 6(sp)
+; CHECK-NEXT:    fsh ft0, 22(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_19
 ; CHECK-NEXT:  # %bb.18:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 2
@@ -238,7 +238,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 2
 ; CHECK-NEXT:  .LBB4_20:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 4(sp)
+; CHECK-NEXT:    fsh ft0, 20(sp)
 ; CHECK-NEXT:    bnez a0, .LBB4_22
 ; CHECK-NEXT:  # %bb.21:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 1
@@ -247,10 +247,11 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 1
 ; CHECK-NEXT:  .LBB4_23:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 2(sp)
+; CHECK-NEXT:    fsh ft0, 18(sp)
 ; CHECK-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; CHECK-NEXT:    vle16.v v8, (sp)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %v = select i1 %c, <8 x half> %a, <8 x half> %b
   ret <8 x half> %v
@@ -259,14 +260,14 @@
 define <8 x half> @selectcc_v8f16(half %a, half %b, <8 x half> %c, <8 x half> %d) {
 ; CHECK-LABEL: selectcc_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    feq.h a0, fa0, fa1
 ; CHECK-NEXT:    bnez a0, .LBB5_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v9
-; CHECK-NEXT:    fsh ft0, 0(sp)
+; CHECK-NEXT:    fsh ft0, 16(sp)
 ; CHECK-NEXT:    beqz a0, .LBB5_4
 ; CHECK-NEXT:  .LBB5_2:
 ; CHECK-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
@@ -275,14 +276,14 @@
 ; CHECK-NEXT:  .LBB5_3:
 ; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v8
-; CHECK-NEXT:    fsh ft0, 0(sp)
+; CHECK-NEXT:    fsh ft0, 16(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_2
 ; CHECK-NEXT:  .LBB5_4:
 ; CHECK-NEXT:    vsetivli zero, 1, e16,m1,ta,mu
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 7
 ; CHECK-NEXT:  .LBB5_5:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 14(sp)
+; CHECK-NEXT:    fsh ft0, 30(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_7
 ; CHECK-NEXT:  # %bb.6:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 6
@@ -291,7 +292,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 6
 ; CHECK-NEXT:  .LBB5_8:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 12(sp)
+; CHECK-NEXT:    fsh ft0, 28(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_10
 ; CHECK-NEXT:  # %bb.9:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 5
@@ -300,7 +301,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 5
 ; CHECK-NEXT:  .LBB5_11:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 10(sp)
+; CHECK-NEXT:    fsh ft0, 26(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_13
 ; CHECK-NEXT:  # %bb.12:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 4
@@ -309,7 +310,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 4
 ; CHECK-NEXT:  .LBB5_14:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 8(sp)
+; CHECK-NEXT:    fsh ft0, 24(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_16
 ; CHECK-NEXT:  # %bb.15:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 3
@@ -318,7 +319,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 3
 ; CHECK-NEXT:  .LBB5_17:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 6(sp)
+; CHECK-NEXT:    fsh ft0, 22(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_19
 ; CHECK-NEXT:  # %bb.18:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 2
@@ -327,7 +328,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 2
 ; CHECK-NEXT:  .LBB5_20:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 4(sp)
+; CHECK-NEXT:    fsh ft0, 20(sp)
 ; CHECK-NEXT:    bnez a0, .LBB5_22
 ; CHECK-NEXT:  # %bb.21:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 1
@@ -336,10 +337,11 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 1
 ; CHECK-NEXT:  .LBB5_23:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsh ft0, 2(sp)
+; CHECK-NEXT:    fsh ft0, 18(sp)
 ; CHECK-NEXT:    vsetivli zero, 8, e16,m1,ta,mu
-; CHECK-NEXT:    vle16.v v8, (sp)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %cmp = fcmp oeq half %a, %b
   %v = select i1 %cmp, <8 x half> %c, <8 x half> %d
@@ -349,20 +351,20 @@
 define <16 x half> @select_v16f16(i1 zeroext %c, <16 x half> %a, <16 x half> %b) {
 ; RV32-LABEL: select_v16f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    bnez a0, .LBB6_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsh ft0, 0(sp)
+; RV32-NEXT:    fsh ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB6_4
 ; RV32-NEXT:  .LBB6_2:
 ; RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
@@ -371,14 +373,14 @@
 ; RV32-NEXT:  .LBB6_3:
 ; RV32-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsh ft0, 0(sp)
+; RV32-NEXT:    fsh ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_2
 ; RV32-NEXT:  .LBB6_4:
 ; RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 15
 ; RV32-NEXT:  .LBB6_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 30(sp)
+; RV32-NEXT:    fsh ft0, 62(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 14
@@ -387,7 +389,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 14
 ; RV32-NEXT:  .LBB6_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 28(sp)
+; RV32-NEXT:    fsh ft0, 60(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 13
@@ -396,7 +398,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 13
 ; RV32-NEXT:  .LBB6_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 26(sp)
+; RV32-NEXT:    fsh ft0, 58(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 12
@@ -405,7 +407,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 12
 ; RV32-NEXT:  .LBB6_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 24(sp)
+; RV32-NEXT:    fsh ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 11
@@ -414,7 +416,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 11
 ; RV32-NEXT:  .LBB6_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 22(sp)
+; RV32-NEXT:    fsh ft0, 54(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 10
@@ -423,7 +425,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 10
 ; RV32-NEXT:  .LBB6_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 20(sp)
+; RV32-NEXT:    fsh ft0, 52(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 9
@@ -432,7 +434,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 9
 ; RV32-NEXT:  .LBB6_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 18(sp)
+; RV32-NEXT:    fsh ft0, 50(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 8
@@ -441,7 +443,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 8
 ; RV32-NEXT:  .LBB6_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 16(sp)
+; RV32-NEXT:    fsh ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 7
@@ -450,7 +452,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 7
 ; RV32-NEXT:  .LBB6_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 14(sp)
+; RV32-NEXT:    fsh ft0, 46(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 6
@@ -459,7 +461,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 6
 ; RV32-NEXT:  .LBB6_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 12(sp)
+; RV32-NEXT:    fsh ft0, 44(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 5
@@ -468,7 +470,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 5
 ; RV32-NEXT:  .LBB6_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 10(sp)
+; RV32-NEXT:    fsh ft0, 42(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 4
@@ -477,7 +479,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 4
 ; RV32-NEXT:  .LBB6_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 8(sp)
+; RV32-NEXT:    fsh ft0, 40(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
@@ -486,7 +488,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 3
 ; RV32-NEXT:  .LBB6_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 6(sp)
+; RV32-NEXT:    fsh ft0, 38(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -495,7 +497,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB6_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 4(sp)
+; RV32-NEXT:    fsh ft0, 36(sp)
 ; RV32-NEXT:    bnez a0, .LBB6_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -504,31 +506,32 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB6_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 2(sp)
+; RV32-NEXT:    fsh ft0, 34(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; RV32-NEXT:    vle16.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v16f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    bnez a0, .LBB6_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsh ft0, 0(sp)
+; RV64-NEXT:    fsh ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB6_4
 ; RV64-NEXT:  .LBB6_2:
 ; RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
@@ -537,14 +540,14 @@
 ; RV64-NEXT:  .LBB6_3:
 ; RV64-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsh ft0, 0(sp)
+; RV64-NEXT:    fsh ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_2
 ; RV64-NEXT:  .LBB6_4:
 ; RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 15
 ; RV64-NEXT:  .LBB6_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 30(sp)
+; RV64-NEXT:    fsh ft0, 62(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 14
@@ -553,7 +556,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 14
 ; RV64-NEXT:  .LBB6_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 28(sp)
+; RV64-NEXT:    fsh ft0, 60(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 13
@@ -562,7 +565,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 13
 ; RV64-NEXT:  .LBB6_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 26(sp)
+; RV64-NEXT:    fsh ft0, 58(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 12
@@ -571,7 +574,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 12
 ; RV64-NEXT:  .LBB6_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 24(sp)
+; RV64-NEXT:    fsh ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 11
@@ -580,7 +583,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 11
 ; RV64-NEXT:  .LBB6_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 22(sp)
+; RV64-NEXT:    fsh ft0, 54(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 10
@@ -589,7 +592,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 10
 ; RV64-NEXT:  .LBB6_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 20(sp)
+; RV64-NEXT:    fsh ft0, 52(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 9
@@ -598,7 +601,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 9
 ; RV64-NEXT:  .LBB6_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 18(sp)
+; RV64-NEXT:    fsh ft0, 50(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 8
@@ -607,7 +610,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 8
 ; RV64-NEXT:  .LBB6_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 16(sp)
+; RV64-NEXT:    fsh ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 7
@@ -616,7 +619,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 7
 ; RV64-NEXT:  .LBB6_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 14(sp)
+; RV64-NEXT:    fsh ft0, 46(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 6
@@ -625,7 +628,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 6
 ; RV64-NEXT:  .LBB6_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 12(sp)
+; RV64-NEXT:    fsh ft0, 44(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 5
@@ -634,7 +637,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 5
 ; RV64-NEXT:  .LBB6_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 10(sp)
+; RV64-NEXT:    fsh ft0, 42(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 4
@@ -643,7 +646,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 4
 ; RV64-NEXT:  .LBB6_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 8(sp)
+; RV64-NEXT:    fsh ft0, 40(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
@@ -652,7 +655,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 3
 ; RV64-NEXT:  .LBB6_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 6(sp)
+; RV64-NEXT:    fsh ft0, 38(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -661,7 +664,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB6_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 4(sp)
+; RV64-NEXT:    fsh ft0, 36(sp)
 ; RV64-NEXT:    bnez a0, .LBB6_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -670,13 +673,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB6_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 2(sp)
+; RV64-NEXT:    fsh ft0, 34(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; RV64-NEXT:    vle16.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %v = select i1 %c, <16 x half> %a, <16 x half> %b
   ret <16 x half> %v
@@ -685,13 +689,13 @@
 define <16 x half> @selectcc_v16f16(half %a, half %b, <16 x half> %c, <16 x half> %d) {
 ; RV32-LABEL: selectcc_v16f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    feq.h a0, fa0, fa1
@@ -699,7 +703,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsh ft0, 0(sp)
+; RV32-NEXT:    fsh ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB7_4
 ; RV32-NEXT:  .LBB7_2:
 ; RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
@@ -708,14 +712,14 @@
 ; RV32-NEXT:  .LBB7_3:
 ; RV32-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsh ft0, 0(sp)
+; RV32-NEXT:    fsh ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_2
 ; RV32-NEXT:  .LBB7_4:
 ; RV32-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 15
 ; RV32-NEXT:  .LBB7_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 30(sp)
+; RV32-NEXT:    fsh ft0, 62(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 14
@@ -724,7 +728,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 14
 ; RV32-NEXT:  .LBB7_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 28(sp)
+; RV32-NEXT:    fsh ft0, 60(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 13
@@ -733,7 +737,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 13
 ; RV32-NEXT:  .LBB7_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 26(sp)
+; RV32-NEXT:    fsh ft0, 58(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 12
@@ -742,7 +746,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 12
 ; RV32-NEXT:  .LBB7_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 24(sp)
+; RV32-NEXT:    fsh ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 11
@@ -751,7 +755,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 11
 ; RV32-NEXT:  .LBB7_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 22(sp)
+; RV32-NEXT:    fsh ft0, 54(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 10
@@ -760,7 +764,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 10
 ; RV32-NEXT:  .LBB7_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 20(sp)
+; RV32-NEXT:    fsh ft0, 52(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 9
@@ -769,7 +773,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 9
 ; RV32-NEXT:  .LBB7_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 18(sp)
+; RV32-NEXT:    fsh ft0, 50(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 8
@@ -778,7 +782,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 8
 ; RV32-NEXT:  .LBB7_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 16(sp)
+; RV32-NEXT:    fsh ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 7
@@ -787,7 +791,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 7
 ; RV32-NEXT:  .LBB7_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 14(sp)
+; RV32-NEXT:    fsh ft0, 46(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 6
@@ -796,7 +800,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 6
 ; RV32-NEXT:  .LBB7_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 12(sp)
+; RV32-NEXT:    fsh ft0, 44(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 5
@@ -805,7 +809,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 5
 ; RV32-NEXT:  .LBB7_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 10(sp)
+; RV32-NEXT:    fsh ft0, 42(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 4
@@ -814,7 +818,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 4
 ; RV32-NEXT:  .LBB7_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 8(sp)
+; RV32-NEXT:    fsh ft0, 40(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
@@ -823,7 +827,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 3
 ; RV32-NEXT:  .LBB7_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 6(sp)
+; RV32-NEXT:    fsh ft0, 38(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -832,7 +836,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB7_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 4(sp)
+; RV32-NEXT:    fsh ft0, 36(sp)
 ; RV32-NEXT:    bnez a0, .LBB7_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -841,24 +845,25 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB7_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsh ft0, 2(sp)
+; RV32-NEXT:    fsh ft0, 34(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; RV32-NEXT:    vle16.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v16f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    feq.h a0, fa0, fa1
@@ -866,7 +871,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsh ft0, 0(sp)
+; RV64-NEXT:    fsh ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB7_4
 ; RV64-NEXT:  .LBB7_2:
 ; RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
@@ -875,14 +880,14 @@
 ; RV64-NEXT:  .LBB7_3:
 ; RV64-NEXT:    vsetvli zero, zero, e16,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsh ft0, 0(sp)
+; RV64-NEXT:    fsh ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_2
 ; RV64-NEXT:  .LBB7_4:
 ; RV64-NEXT:    vsetivli zero, 1, e16,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 15
 ; RV64-NEXT:  .LBB7_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 30(sp)
+; RV64-NEXT:    fsh ft0, 62(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 14
@@ -891,7 +896,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 14
 ; RV64-NEXT:  .LBB7_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 28(sp)
+; RV64-NEXT:    fsh ft0, 60(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 13
@@ -900,7 +905,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 13
 ; RV64-NEXT:  .LBB7_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 26(sp)
+; RV64-NEXT:    fsh ft0, 58(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 12
@@ -909,7 +914,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 12
 ; RV64-NEXT:  .LBB7_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 24(sp)
+; RV64-NEXT:    fsh ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 11
@@ -918,7 +923,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 11
 ; RV64-NEXT:  .LBB7_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 22(sp)
+; RV64-NEXT:    fsh ft0, 54(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 10
@@ -927,7 +932,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 10
 ; RV64-NEXT:  .LBB7_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 20(sp)
+; RV64-NEXT:    fsh ft0, 52(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 9
@@ -936,7 +941,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 9
 ; RV64-NEXT:  .LBB7_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 18(sp)
+; RV64-NEXT:    fsh ft0, 50(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 8
@@ -945,7 +950,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 8
 ; RV64-NEXT:  .LBB7_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 16(sp)
+; RV64-NEXT:    fsh ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 7
@@ -954,7 +959,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 7
 ; RV64-NEXT:  .LBB7_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 14(sp)
+; RV64-NEXT:    fsh ft0, 46(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 6
@@ -963,7 +968,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 6
 ; RV64-NEXT:  .LBB7_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 12(sp)
+; RV64-NEXT:    fsh ft0, 44(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 5
@@ -972,7 +977,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 5
 ; RV64-NEXT:  .LBB7_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 10(sp)
+; RV64-NEXT:    fsh ft0, 42(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 4
@@ -981,7 +986,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 4
 ; RV64-NEXT:  .LBB7_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 8(sp)
+; RV64-NEXT:    fsh ft0, 40(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
@@ -990,7 +995,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 3
 ; RV64-NEXT:  .LBB7_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 6(sp)
+; RV64-NEXT:    fsh ft0, 38(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -999,7 +1004,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB7_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 4(sp)
+; RV64-NEXT:    fsh ft0, 36(sp)
 ; RV64-NEXT:    bnez a0, .LBB7_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -1008,13 +1013,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB7_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsh ft0, 2(sp)
+; RV64-NEXT:    fsh ft0, 34(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e16,m2,ta,mu
-; RV64-NEXT:    vle16.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq half %a, %b
   %v = select i1 %cmp, <16 x half> %c, <16 x half> %d
@@ -1080,13 +1086,13 @@
 define <4 x float> @select_v4f32(i1 zeroext %c, <4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: select_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    bnez a0, .LBB10_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vsetvli zero, zero, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v9
-; CHECK-NEXT:    fsw ft0, 0(sp)
+; CHECK-NEXT:    fsw ft0, 16(sp)
 ; CHECK-NEXT:    beqz a0, .LBB10_4
 ; CHECK-NEXT:  .LBB10_2:
 ; CHECK-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
@@ -1095,14 +1101,14 @@
 ; CHECK-NEXT:  .LBB10_3:
 ; CHECK-NEXT:    vsetvli zero, zero, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v8
-; CHECK-NEXT:    fsw ft0, 0(sp)
+; CHECK-NEXT:    fsw ft0, 16(sp)
 ; CHECK-NEXT:    bnez a0, .LBB10_2
 ; CHECK-NEXT:  .LBB10_4:
 ; CHECK-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 3
 ; CHECK-NEXT:  .LBB10_5:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 12(sp)
+; CHECK-NEXT:    fsw ft0, 28(sp)
 ; CHECK-NEXT:    bnez a0, .LBB10_7
 ; CHECK-NEXT:  # %bb.6:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 2
@@ -1111,7 +1117,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 2
 ; CHECK-NEXT:  .LBB10_8:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 8(sp)
+; CHECK-NEXT:    fsw ft0, 24(sp)
 ; CHECK-NEXT:    bnez a0, .LBB10_10
 ; CHECK-NEXT:  # %bb.9:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 1
@@ -1120,10 +1126,11 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 1
 ; CHECK-NEXT:  .LBB10_11:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 4(sp)
+; CHECK-NEXT:    fsw ft0, 20(sp)
 ; CHECK-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vle32.v v8, (sp)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %v = select i1 %c, <4 x float> %a, <4 x float> %b
   ret <4 x float> %v
@@ -1132,14 +1139,14 @@
 define <4 x float> @selectcc_v4f32(float %a, float %b, <4 x float> %c, <4 x float> %d) {
 ; CHECK-LABEL: selectcc_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    feq.s a0, fa0, fa1
 ; CHECK-NEXT:    bnez a0, .LBB11_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vsetvli zero, zero, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v9
-; CHECK-NEXT:    fsw ft0, 0(sp)
+; CHECK-NEXT:    fsw ft0, 16(sp)
 ; CHECK-NEXT:    beqz a0, .LBB11_4
 ; CHECK-NEXT:  .LBB11_2:
 ; CHECK-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
@@ -1148,14 +1155,14 @@
 ; CHECK-NEXT:  .LBB11_3:
 ; CHECK-NEXT:    vsetvli zero, zero, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v8
-; CHECK-NEXT:    fsw ft0, 0(sp)
+; CHECK-NEXT:    fsw ft0, 16(sp)
 ; CHECK-NEXT:    bnez a0, .LBB11_2
 ; CHECK-NEXT:  .LBB11_4:
 ; CHECK-NEXT:    vsetivli zero, 1, e32,m1,ta,mu
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 3
 ; CHECK-NEXT:  .LBB11_5:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 12(sp)
+; CHECK-NEXT:    fsw ft0, 28(sp)
 ; CHECK-NEXT:    bnez a0, .LBB11_7
 ; CHECK-NEXT:  # %bb.6:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 2
@@ -1164,7 +1171,7 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 2
 ; CHECK-NEXT:  .LBB11_8:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 8(sp)
+; CHECK-NEXT:    fsw ft0, 24(sp)
 ; CHECK-NEXT:    bnez a0, .LBB11_10
 ; CHECK-NEXT:  # %bb.9:
 ; CHECK-NEXT:    vslidedown.vi v25, v9, 1
@@ -1173,10 +1180,11 @@
 ; CHECK-NEXT:    vslidedown.vi v25, v8, 1
 ; CHECK-NEXT:  .LBB11_11:
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fsw ft0, 4(sp)
+; CHECK-NEXT:    fsw ft0, 20(sp)
 ; CHECK-NEXT:    vsetivli zero, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vle32.v v8, (sp)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %cmp = fcmp oeq float %a, %b
   %v = select i1 %cmp, <4 x float> %c, <4 x float> %d
@@ -1186,20 +1194,20 @@
 define <8 x float> @select_v8f32(i1 zeroext %c, <8 x float> %a, <8 x float> %b) {
 ; RV32-LABEL: select_v8f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    bnez a0, .LBB12_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB12_4
 ; RV32-NEXT:  .LBB12_2:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
@@ -1208,14 +1216,14 @@
 ; RV32-NEXT:  .LBB12_3:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_2
 ; RV32-NEXT:  .LBB12_4:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 7
 ; RV32-NEXT:  .LBB12_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 28(sp)
+; RV32-NEXT:    fsw ft0, 60(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 6
@@ -1224,7 +1232,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 6
 ; RV32-NEXT:  .LBB12_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 24(sp)
+; RV32-NEXT:    fsw ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 5
@@ -1233,7 +1241,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 5
 ; RV32-NEXT:  .LBB12_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 20(sp)
+; RV32-NEXT:    fsw ft0, 52(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 4
@@ -1242,7 +1250,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 4
 ; RV32-NEXT:  .LBB12_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 16(sp)
+; RV32-NEXT:    fsw ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
@@ -1251,7 +1259,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 3
 ; RV32-NEXT:  .LBB12_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 12(sp)
+; RV32-NEXT:    fsw ft0, 44(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -1260,7 +1268,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB12_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 8(sp)
+; RV32-NEXT:    fsw ft0, 40(sp)
 ; RV32-NEXT:    bnez a0, .LBB12_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -1269,31 +1277,32 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB12_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 4(sp)
+; RV32-NEXT:    fsw ft0, 36(sp)
 ; RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v8f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    bnez a0, .LBB12_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB12_4
 ; RV64-NEXT:  .LBB12_2:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
@@ -1302,14 +1311,14 @@
 ; RV64-NEXT:  .LBB12_3:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_2
 ; RV64-NEXT:  .LBB12_4:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 7
 ; RV64-NEXT:  .LBB12_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 28(sp)
+; RV64-NEXT:    fsw ft0, 60(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 6
@@ -1318,7 +1327,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 6
 ; RV64-NEXT:  .LBB12_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 24(sp)
+; RV64-NEXT:    fsw ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 5
@@ -1327,7 +1336,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 5
 ; RV64-NEXT:  .LBB12_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 20(sp)
+; RV64-NEXT:    fsw ft0, 52(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 4
@@ -1336,7 +1345,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 4
 ; RV64-NEXT:  .LBB12_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 16(sp)
+; RV64-NEXT:    fsw ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
@@ -1345,7 +1354,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 3
 ; RV64-NEXT:  .LBB12_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 12(sp)
+; RV64-NEXT:    fsw ft0, 44(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -1354,7 +1363,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB12_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 8(sp)
+; RV64-NEXT:    fsw ft0, 40(sp)
 ; RV64-NEXT:    bnez a0, .LBB12_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -1363,13 +1372,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB12_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 4(sp)
+; RV64-NEXT:    fsw ft0, 36(sp)
 ; RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; RV64-NEXT:    vle32.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %v = select i1 %c, <8 x float> %a, <8 x float> %b
   ret <8 x float> %v
@@ -1378,13 +1388,13 @@
 define <8 x float> @selectcc_v8f32(float %a, float %b, <8 x float> %c, <8 x float> %d) {
 ; RV32-LABEL: selectcc_v8f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    feq.s a0, fa0, fa1
@@ -1392,7 +1402,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB13_4
 ; RV32-NEXT:  .LBB13_2:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
@@ -1401,14 +1411,14 @@
 ; RV32-NEXT:  .LBB13_3:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_2
 ; RV32-NEXT:  .LBB13_4:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 7
 ; RV32-NEXT:  .LBB13_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 28(sp)
+; RV32-NEXT:    fsw ft0, 60(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 6
@@ -1417,7 +1427,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 6
 ; RV32-NEXT:  .LBB13_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 24(sp)
+; RV32-NEXT:    fsw ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 5
@@ -1426,7 +1436,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 5
 ; RV32-NEXT:  .LBB13_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 20(sp)
+; RV32-NEXT:    fsw ft0, 52(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 4
@@ -1435,7 +1445,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 4
 ; RV32-NEXT:  .LBB13_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 16(sp)
+; RV32-NEXT:    fsw ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
@@ -1444,7 +1454,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 3
 ; RV32-NEXT:  .LBB13_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 12(sp)
+; RV32-NEXT:    fsw ft0, 44(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -1453,7 +1463,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB13_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 8(sp)
+; RV32-NEXT:    fsw ft0, 40(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -1462,24 +1472,25 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB13_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsw ft0, 4(sp)
+; RV32-NEXT:    fsw ft0, 36(sp)
 ; RV32-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v8f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    feq.s a0, fa0, fa1
@@ -1487,7 +1498,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB13_4
 ; RV64-NEXT:  .LBB13_2:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
@@ -1496,14 +1507,14 @@
 ; RV64-NEXT:  .LBB13_3:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_2
 ; RV64-NEXT:  .LBB13_4:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 7
 ; RV64-NEXT:  .LBB13_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 28(sp)
+; RV64-NEXT:    fsw ft0, 60(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 6
@@ -1512,7 +1523,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 6
 ; RV64-NEXT:  .LBB13_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 24(sp)
+; RV64-NEXT:    fsw ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 5
@@ -1521,7 +1532,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 5
 ; RV64-NEXT:  .LBB13_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 20(sp)
+; RV64-NEXT:    fsw ft0, 52(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 4
@@ -1530,7 +1541,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 4
 ; RV64-NEXT:  .LBB13_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 16(sp)
+; RV64-NEXT:    fsw ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
@@ -1539,7 +1550,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 3
 ; RV64-NEXT:  .LBB13_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 12(sp)
+; RV64-NEXT:    fsw ft0, 44(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -1548,7 +1559,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB13_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 8(sp)
+; RV64-NEXT:    fsw ft0, 40(sp)
 ; RV64-NEXT:    bnez a0, .LBB13_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -1557,13 +1568,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB13_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsw ft0, 4(sp)
+; RV64-NEXT:    fsw ft0, 36(sp)
 ; RV64-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
-; RV64-NEXT:    vle32.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq float %a, %b
   %v = select i1 %cmp, <8 x float> %c, <8 x float> %d
@@ -1573,20 +1585,20 @@
 define <16 x float> @select_v16f32(i1 zeroext %c, <16 x float> %a, <16 x float> %b) {
 ; RV32-LABEL: select_v16f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -192
+; RV32-NEXT:    .cfi_def_cfa_offset 192
+; RV32-NEXT:    sw ra, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 184(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    addi s0, sp, 192
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    bnez a0, .LBB14_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v12
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
 ; RV32-NEXT:    beqz a0, .LBB14_4
 ; RV32-NEXT:  .LBB14_2:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
@@ -1595,14 +1607,14 @@
 ; RV32-NEXT:  .LBB14_3:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_2
 ; RV32-NEXT:  .LBB14_4:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
 ; RV32-NEXT:    vslidedown.vi v28, v12, 15
 ; RV32-NEXT:  .LBB14_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 60(sp)
+; RV32-NEXT:    fsw ft0, 124(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 14
@@ -1611,7 +1623,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 14
 ; RV32-NEXT:  .LBB14_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 56(sp)
+; RV32-NEXT:    fsw ft0, 120(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 13
@@ -1620,7 +1632,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 13
 ; RV32-NEXT:  .LBB14_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 52(sp)
+; RV32-NEXT:    fsw ft0, 116(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 12
@@ -1629,7 +1641,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 12
 ; RV32-NEXT:  .LBB14_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 48(sp)
+; RV32-NEXT:    fsw ft0, 112(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 11
@@ -1638,7 +1650,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 11
 ; RV32-NEXT:  .LBB14_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 44(sp)
+; RV32-NEXT:    fsw ft0, 108(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 10
@@ -1647,7 +1659,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 10
 ; RV32-NEXT:  .LBB14_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 40(sp)
+; RV32-NEXT:    fsw ft0, 104(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 9
@@ -1656,7 +1668,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 9
 ; RV32-NEXT:  .LBB14_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 36(sp)
+; RV32-NEXT:    fsw ft0, 100(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 8
@@ -1665,7 +1677,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 8
 ; RV32-NEXT:  .LBB14_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 32(sp)
+; RV32-NEXT:    fsw ft0, 96(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 7
@@ -1674,7 +1686,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 7
 ; RV32-NEXT:  .LBB14_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 28(sp)
+; RV32-NEXT:    fsw ft0, 92(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 6
@@ -1683,7 +1695,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 6
 ; RV32-NEXT:  .LBB14_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 24(sp)
+; RV32-NEXT:    fsw ft0, 88(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 5
@@ -1692,7 +1704,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 5
 ; RV32-NEXT:  .LBB14_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 20(sp)
+; RV32-NEXT:    fsw ft0, 84(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 4
@@ -1701,7 +1713,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 4
 ; RV32-NEXT:  .LBB14_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 16(sp)
+; RV32-NEXT:    fsw ft0, 80(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 3
@@ -1710,7 +1722,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 3
 ; RV32-NEXT:  .LBB14_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 12(sp)
+; RV32-NEXT:    fsw ft0, 76(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 2
@@ -1719,7 +1731,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 2
 ; RV32-NEXT:  .LBB14_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 8(sp)
+; RV32-NEXT:    fsw ft0, 72(sp)
 ; RV32-NEXT:    bnez a0, .LBB14_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 1
@@ -1728,31 +1740,32 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 1
 ; RV32-NEXT:  .LBB14_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 4(sp)
+; RV32-NEXT:    fsw ft0, 68(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -192
+; RV32-NEXT:    lw s0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 192
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v16f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -192
+; RV64-NEXT:    .cfi_def_cfa_offset 192
+; RV64-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 176(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    addi s0, sp, 192
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    bnez a0, .LBB14_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v12
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
 ; RV64-NEXT:    beqz a0, .LBB14_4
 ; RV64-NEXT:  .LBB14_2:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
@@ -1761,14 +1774,14 @@
 ; RV64-NEXT:  .LBB14_3:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_2
 ; RV64-NEXT:  .LBB14_4:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
 ; RV64-NEXT:    vslidedown.vi v28, v12, 15
 ; RV64-NEXT:  .LBB14_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 60(sp)
+; RV64-NEXT:    fsw ft0, 124(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 14
@@ -1777,7 +1790,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 14
 ; RV64-NEXT:  .LBB14_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 56(sp)
+; RV64-NEXT:    fsw ft0, 120(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 13
@@ -1786,7 +1799,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 13
 ; RV64-NEXT:  .LBB14_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 52(sp)
+; RV64-NEXT:    fsw ft0, 116(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 12
@@ -1795,7 +1808,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 12
 ; RV64-NEXT:  .LBB14_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 48(sp)
+; RV64-NEXT:    fsw ft0, 112(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 11
@@ -1804,7 +1817,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 11
 ; RV64-NEXT:  .LBB14_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 44(sp)
+; RV64-NEXT:    fsw ft0, 108(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 10
@@ -1813,7 +1826,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 10
 ; RV64-NEXT:  .LBB14_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 40(sp)
+; RV64-NEXT:    fsw ft0, 104(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 9
@@ -1822,7 +1835,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 9
 ; RV64-NEXT:  .LBB14_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 36(sp)
+; RV64-NEXT:    fsw ft0, 100(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 8
@@ -1831,7 +1844,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 8
 ; RV64-NEXT:  .LBB14_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 32(sp)
+; RV64-NEXT:    fsw ft0, 96(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 7
@@ -1840,7 +1853,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 7
 ; RV64-NEXT:  .LBB14_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 28(sp)
+; RV64-NEXT:    fsw ft0, 92(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 6
@@ -1849,7 +1862,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 6
 ; RV64-NEXT:  .LBB14_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 24(sp)
+; RV64-NEXT:    fsw ft0, 88(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 5
@@ -1858,7 +1871,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 5
 ; RV64-NEXT:  .LBB14_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 20(sp)
+; RV64-NEXT:    fsw ft0, 84(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 4
@@ -1867,7 +1880,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 4
 ; RV64-NEXT:  .LBB14_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 16(sp)
+; RV64-NEXT:    fsw ft0, 80(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 3
@@ -1876,7 +1889,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 3
 ; RV64-NEXT:  .LBB14_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 12(sp)
+; RV64-NEXT:    fsw ft0, 76(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 2
@@ -1885,7 +1898,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 2
 ; RV64-NEXT:  .LBB14_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 8(sp)
+; RV64-NEXT:    fsw ft0, 72(sp)
 ; RV64-NEXT:    bnez a0, .LBB14_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 1
@@ -1894,13 +1907,14 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 1
 ; RV64-NEXT:  .LBB14_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 4(sp)
+; RV64-NEXT:    fsw ft0, 68(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
-; RV64-NEXT:    vle32.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -192
+; RV64-NEXT:    ld s0, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 192
 ; RV64-NEXT:    ret
   %v = select i1 %c, <16 x float> %a, <16 x float> %b
   ret <16 x float> %v
@@ -1909,13 +1923,13 @@
 define <16 x float> @selectcc_v16f32(float %a, float %b, <16 x float> %c, <16 x float> %d) {
 ; RV32-LABEL: selectcc_v16f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -192
+; RV32-NEXT:    .cfi_def_cfa_offset 192
+; RV32-NEXT:    sw ra, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 184(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    addi s0, sp, 192
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    feq.s a0, fa0, fa1
@@ -1923,7 +1937,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v12
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
 ; RV32-NEXT:    beqz a0, .LBB15_4
 ; RV32-NEXT:  .LBB15_2:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
@@ -1932,14 +1946,14 @@
 ; RV32-NEXT:  .LBB15_3:
 ; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsw ft0, 0(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_2
 ; RV32-NEXT:  .LBB15_4:
 ; RV32-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
 ; RV32-NEXT:    vslidedown.vi v28, v12, 15
 ; RV32-NEXT:  .LBB15_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 60(sp)
+; RV32-NEXT:    fsw ft0, 124(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 14
@@ -1948,7 +1962,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 14
 ; RV32-NEXT:  .LBB15_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 56(sp)
+; RV32-NEXT:    fsw ft0, 120(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 13
@@ -1957,7 +1971,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 13
 ; RV32-NEXT:  .LBB15_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 52(sp)
+; RV32-NEXT:    fsw ft0, 116(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 12
@@ -1966,7 +1980,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 12
 ; RV32-NEXT:  .LBB15_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 48(sp)
+; RV32-NEXT:    fsw ft0, 112(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 11
@@ -1975,7 +1989,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 11
 ; RV32-NEXT:  .LBB15_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 44(sp)
+; RV32-NEXT:    fsw ft0, 108(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 10
@@ -1984,7 +1998,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 10
 ; RV32-NEXT:  .LBB15_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 40(sp)
+; RV32-NEXT:    fsw ft0, 104(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 9
@@ -1993,7 +2007,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 9
 ; RV32-NEXT:  .LBB15_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 36(sp)
+; RV32-NEXT:    fsw ft0, 100(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 8
@@ -2002,7 +2016,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 8
 ; RV32-NEXT:  .LBB15_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 32(sp)
+; RV32-NEXT:    fsw ft0, 96(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 7
@@ -2011,7 +2025,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 7
 ; RV32-NEXT:  .LBB15_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 28(sp)
+; RV32-NEXT:    fsw ft0, 92(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 6
@@ -2020,7 +2034,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 6
 ; RV32-NEXT:  .LBB15_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 24(sp)
+; RV32-NEXT:    fsw ft0, 88(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 5
@@ -2029,7 +2043,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 5
 ; RV32-NEXT:  .LBB15_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 20(sp)
+; RV32-NEXT:    fsw ft0, 84(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 4
@@ -2038,7 +2052,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 4
 ; RV32-NEXT:  .LBB15_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 16(sp)
+; RV32-NEXT:    fsw ft0, 80(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 3
@@ -2047,7 +2061,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 3
 ; RV32-NEXT:  .LBB15_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 12(sp)
+; RV32-NEXT:    fsw ft0, 76(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 2
@@ -2056,7 +2070,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 2
 ; RV32-NEXT:  .LBB15_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 8(sp)
+; RV32-NEXT:    fsw ft0, 72(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 1
@@ -2065,24 +2079,25 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 1
 ; RV32-NEXT:  .LBB15_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsw ft0, 4(sp)
+; RV32-NEXT:    fsw ft0, 68(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -192
+; RV32-NEXT:    lw s0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 192
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v16f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -192
+; RV64-NEXT:    .cfi_def_cfa_offset 192
+; RV64-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 176(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    addi s0, sp, 192
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    feq.s a0, fa0, fa1
@@ -2090,7 +2105,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v12
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
 ; RV64-NEXT:    beqz a0, .LBB15_4
 ; RV64-NEXT:  .LBB15_2:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
@@ -2099,14 +2114,14 @@
 ; RV64-NEXT:  .LBB15_3:
 ; RV64-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsw ft0, 0(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_2
 ; RV64-NEXT:  .LBB15_4:
 ; RV64-NEXT:    vsetivli zero, 1, e32,m4,ta,mu
 ; RV64-NEXT:    vslidedown.vi v28, v12, 15
 ; RV64-NEXT:  .LBB15_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 60(sp)
+; RV64-NEXT:    fsw ft0, 124(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 14
@@ -2115,7 +2130,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 14
 ; RV64-NEXT:  .LBB15_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 56(sp)
+; RV64-NEXT:    fsw ft0, 120(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 13
@@ -2124,7 +2139,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 13
 ; RV64-NEXT:  .LBB15_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 52(sp)
+; RV64-NEXT:    fsw ft0, 116(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 12
@@ -2133,7 +2148,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 12
 ; RV64-NEXT:  .LBB15_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 48(sp)
+; RV64-NEXT:    fsw ft0, 112(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 11
@@ -2142,7 +2157,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 11
 ; RV64-NEXT:  .LBB15_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 44(sp)
+; RV64-NEXT:    fsw ft0, 108(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 10
@@ -2151,7 +2166,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 10
 ; RV64-NEXT:  .LBB15_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 40(sp)
+; RV64-NEXT:    fsw ft0, 104(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 9
@@ -2160,7 +2175,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 9
 ; RV64-NEXT:  .LBB15_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 36(sp)
+; RV64-NEXT:    fsw ft0, 100(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 8
@@ -2169,7 +2184,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 8
 ; RV64-NEXT:  .LBB15_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 32(sp)
+; RV64-NEXT:    fsw ft0, 96(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 7
@@ -2178,7 +2193,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 7
 ; RV64-NEXT:  .LBB15_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 28(sp)
+; RV64-NEXT:    fsw ft0, 92(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 6
@@ -2187,7 +2202,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 6
 ; RV64-NEXT:  .LBB15_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 24(sp)
+; RV64-NEXT:    fsw ft0, 88(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 5
@@ -2196,7 +2211,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 5
 ; RV64-NEXT:  .LBB15_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 20(sp)
+; RV64-NEXT:    fsw ft0, 84(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 4
@@ -2205,7 +2220,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 4
 ; RV64-NEXT:  .LBB15_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 16(sp)
+; RV64-NEXT:    fsw ft0, 80(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 3
@@ -2214,7 +2229,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 3
 ; RV64-NEXT:  .LBB15_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 12(sp)
+; RV64-NEXT:    fsw ft0, 76(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 2
@@ -2223,7 +2238,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 2
 ; RV64-NEXT:  .LBB15_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 8(sp)
+; RV64-NEXT:    fsw ft0, 72(sp)
 ; RV64-NEXT:    bnez a0, .LBB15_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 1
@@ -2232,13 +2247,14 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 1
 ; RV64-NEXT:  .LBB15_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsw ft0, 4(sp)
+; RV64-NEXT:    fsw ft0, 68(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e32,m4,ta,mu
-; RV64-NEXT:    vle32.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -192
+; RV64-NEXT:    ld s0, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 192
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq float %a, %b
   %v = select i1 %cmp, <16 x float> %c, <16 x float> %d
@@ -2304,20 +2320,20 @@
 define <4 x double> @select_v4f64(i1 zeroext %c, <4 x double> %a, <4 x double> %b) {
 ; RV32-LABEL: select_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    bnez a0, .LBB18_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB18_4
 ; RV32-NEXT:  .LBB18_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
@@ -2326,14 +2342,14 @@
 ; RV32-NEXT:  .LBB18_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB18_2
 ; RV32-NEXT:  .LBB18_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
 ; RV32-NEXT:  .LBB18_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB18_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -2342,7 +2358,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB18_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB18_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -2351,31 +2367,32 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB18_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 40(sp)
 ; RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    bnez a0, .LBB18_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB18_4
 ; RV64-NEXT:  .LBB18_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
@@ -2384,14 +2401,14 @@
 ; RV64-NEXT:  .LBB18_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB18_2
 ; RV64-NEXT:  .LBB18_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
 ; RV64-NEXT:  .LBB18_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB18_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -2400,7 +2417,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB18_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB18_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -2409,13 +2426,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB18_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 40(sp)
 ; RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %v = select i1 %c, <4 x double> %a, <4 x double> %b
   ret <4 x double> %v
@@ -2424,13 +2442,13 @@
 define <4 x double> @selectcc_v4f64(double %a, double %b, <4 x double> %c, <4 x double> %d) {
 ; RV32-LABEL: selectcc_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
+; RV32-NEXT:    addi s0, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -32
 ; RV32-NEXT:    feq.d a0, fa0, fa1
@@ -2438,7 +2456,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v10
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 32(sp)
 ; RV32-NEXT:    beqz a0, .LBB19_4
 ; RV32-NEXT:  .LBB19_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
@@ -2447,14 +2465,14 @@
 ; RV32-NEXT:  .LBB19_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 32(sp)
 ; RV32-NEXT:    bnez a0, .LBB19_2
 ; RV32-NEXT:  .LBB19_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; RV32-NEXT:    vslidedown.vi v26, v10, 3
 ; RV32-NEXT:  .LBB19_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 56(sp)
 ; RV32-NEXT:    bnez a0, .LBB19_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 2
@@ -2463,7 +2481,7 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 2
 ; RV32-NEXT:  .LBB19_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 48(sp)
 ; RV32-NEXT:    bnez a0, .LBB19_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v26, v10, 1
@@ -2472,24 +2490,25 @@
 ; RV32-NEXT:    vslidedown.vi v26, v8, 1
 ; RV32-NEXT:  .LBB19_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 40(sp)
 ; RV32-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    addi a0, sp, 32
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -96
+; RV32-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -96
+; RV64-NEXT:    .cfi_def_cfa_offset 96
+; RV64-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
+; RV64-NEXT:    addi s0, sp, 96
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -32
 ; RV64-NEXT:    feq.d a0, fa0, fa1
@@ -2497,7 +2516,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v10
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 32(sp)
 ; RV64-NEXT:    beqz a0, .LBB19_4
 ; RV64-NEXT:  .LBB19_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
@@ -2506,14 +2525,14 @@
 ; RV64-NEXT:  .LBB19_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 32(sp)
 ; RV64-NEXT:    bnez a0, .LBB19_2
 ; RV64-NEXT:  .LBB19_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m2,ta,mu
 ; RV64-NEXT:    vslidedown.vi v26, v10, 3
 ; RV64-NEXT:  .LBB19_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 56(sp)
 ; RV64-NEXT:    bnez a0, .LBB19_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 2
@@ -2522,7 +2541,7 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 2
 ; RV64-NEXT:  .LBB19_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 48(sp)
 ; RV64-NEXT:    bnez a0, .LBB19_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v26, v10, 1
@@ -2531,13 +2550,14 @@
 ; RV64-NEXT:    vslidedown.vi v26, v8, 1
 ; RV64-NEXT:  .LBB19_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 40(sp)
 ; RV64-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, sp, 32
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -96
+; RV64-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 96
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq double %a, %b
   %v = select i1 %cmp, <4 x double> %c, <4 x double> %d
@@ -2547,20 +2567,20 @@
 define <8 x double> @select_v8f64(i1 zeroext %c, <8 x double> %a, <8 x double> %b) {
 ; RV32-LABEL: select_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -192
+; RV32-NEXT:    .cfi_def_cfa_offset 192
+; RV32-NEXT:    sw ra, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 184(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    addi s0, sp, 192
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    bnez a0, .LBB20_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v12
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 64(sp)
 ; RV32-NEXT:    beqz a0, .LBB20_4
 ; RV32-NEXT:  .LBB20_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
@@ -2569,14 +2589,14 @@
 ; RV32-NEXT:  .LBB20_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 64(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_2
 ; RV32-NEXT:  .LBB20_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
 ; RV32-NEXT:    vslidedown.vi v28, v12, 7
 ; RV32-NEXT:  .LBB20_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 56(sp)
+; RV32-NEXT:    fsd ft0, 120(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 6
@@ -2585,7 +2605,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 6
 ; RV32-NEXT:  .LBB20_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 48(sp)
+; RV32-NEXT:    fsd ft0, 112(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 5
@@ -2594,7 +2614,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 5
 ; RV32-NEXT:  .LBB20_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 40(sp)
+; RV32-NEXT:    fsd ft0, 104(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 4
@@ -2603,7 +2623,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 4
 ; RV32-NEXT:  .LBB20_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 32(sp)
+; RV32-NEXT:    fsd ft0, 96(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 3
@@ -2612,7 +2632,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 3
 ; RV32-NEXT:  .LBB20_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 88(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 2
@@ -2621,7 +2641,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 2
 ; RV32-NEXT:  .LBB20_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 80(sp)
 ; RV32-NEXT:    bnez a0, .LBB20_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 1
@@ -2630,31 +2650,32 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 1
 ; RV32-NEXT:  .LBB20_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 72(sp)
 ; RV32-NEXT:    vsetivli zero, 8, e64,m4,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -192
+; RV32-NEXT:    lw s0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 192
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v8f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -192
+; RV64-NEXT:    .cfi_def_cfa_offset 192
+; RV64-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 176(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    addi s0, sp, 192
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    bnez a0, .LBB20_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v12
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 64(sp)
 ; RV64-NEXT:    beqz a0, .LBB20_4
 ; RV64-NEXT:  .LBB20_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
@@ -2663,14 +2684,14 @@
 ; RV64-NEXT:  .LBB20_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 64(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_2
 ; RV64-NEXT:  .LBB20_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
 ; RV64-NEXT:    vslidedown.vi v28, v12, 7
 ; RV64-NEXT:  .LBB20_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 56(sp)
+; RV64-NEXT:    fsd ft0, 120(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 6
@@ -2679,7 +2700,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 6
 ; RV64-NEXT:  .LBB20_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 48(sp)
+; RV64-NEXT:    fsd ft0, 112(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 5
@@ -2688,7 +2709,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 5
 ; RV64-NEXT:  .LBB20_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 40(sp)
+; RV64-NEXT:    fsd ft0, 104(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 4
@@ -2697,7 +2718,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 4
 ; RV64-NEXT:  .LBB20_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 32(sp)
+; RV64-NEXT:    fsd ft0, 96(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 3
@@ -2706,7 +2727,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 3
 ; RV64-NEXT:  .LBB20_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 88(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 2
@@ -2715,7 +2736,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 2
 ; RV64-NEXT:  .LBB20_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 80(sp)
 ; RV64-NEXT:    bnez a0, .LBB20_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 1
@@ -2724,13 +2745,14 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 1
 ; RV64-NEXT:  .LBB20_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 72(sp)
 ; RV64-NEXT:    vsetivli zero, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -192
+; RV64-NEXT:    ld s0, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 192
 ; RV64-NEXT:    ret
   %v = select i1 %c, <8 x double> %a, <8 x double> %b
   ret <8 x double> %v
@@ -2739,13 +2761,13 @@
 define <8 x double> @selectcc_v8f64(double %a, double %b, <8 x double> %c, <8 x double> %d) {
 ; RV32-LABEL: selectcc_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -192
+; RV32-NEXT:    .cfi_def_cfa_offset 192
+; RV32-NEXT:    sw ra, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 184(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    addi s0, sp, 192
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    feq.d a0, fa0, fa1
@@ -2753,7 +2775,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v12
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 64(sp)
 ; RV32-NEXT:    beqz a0, .LBB21_4
 ; RV32-NEXT:  .LBB21_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
@@ -2762,14 +2784,14 @@
 ; RV32-NEXT:  .LBB21_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 64(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_2
 ; RV32-NEXT:  .LBB21_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
 ; RV32-NEXT:    vslidedown.vi v28, v12, 7
 ; RV32-NEXT:  .LBB21_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 56(sp)
+; RV32-NEXT:    fsd ft0, 120(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 6
@@ -2778,7 +2800,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 6
 ; RV32-NEXT:  .LBB21_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 48(sp)
+; RV32-NEXT:    fsd ft0, 112(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 5
@@ -2787,7 +2809,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 5
 ; RV32-NEXT:  .LBB21_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 40(sp)
+; RV32-NEXT:    fsd ft0, 104(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 4
@@ -2796,7 +2818,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 4
 ; RV32-NEXT:  .LBB21_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 32(sp)
+; RV32-NEXT:    fsd ft0, 96(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 3
@@ -2805,7 +2827,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 3
 ; RV32-NEXT:  .LBB21_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 88(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 2
@@ -2814,7 +2836,7 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 2
 ; RV32-NEXT:  .LBB21_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 80(sp)
 ; RV32-NEXT:    bnez a0, .LBB21_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v28, v12, 1
@@ -2823,24 +2845,25 @@
 ; RV32-NEXT:    vslidedown.vi v28, v8, 1
 ; RV32-NEXT:  .LBB21_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v28
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 72(sp)
 ; RV32-NEXT:    vsetivli zero, 8, e64,m4,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -192
+; RV32-NEXT:    lw s0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 192
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v8f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -192
+; RV64-NEXT:    .cfi_def_cfa_offset 192
+; RV64-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 176(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    addi s0, sp, 192
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    feq.d a0, fa0, fa1
@@ -2848,7 +2871,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v12
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 64(sp)
 ; RV64-NEXT:    beqz a0, .LBB21_4
 ; RV64-NEXT:  .LBB21_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
@@ -2857,14 +2880,14 @@
 ; RV64-NEXT:  .LBB21_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 64(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_2
 ; RV64-NEXT:  .LBB21_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m4,ta,mu
 ; RV64-NEXT:    vslidedown.vi v28, v12, 7
 ; RV64-NEXT:  .LBB21_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 56(sp)
+; RV64-NEXT:    fsd ft0, 120(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 6
@@ -2873,7 +2896,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 6
 ; RV64-NEXT:  .LBB21_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 48(sp)
+; RV64-NEXT:    fsd ft0, 112(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 5
@@ -2882,7 +2905,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 5
 ; RV64-NEXT:  .LBB21_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 40(sp)
+; RV64-NEXT:    fsd ft0, 104(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 4
@@ -2891,7 +2914,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 4
 ; RV64-NEXT:  .LBB21_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 32(sp)
+; RV64-NEXT:    fsd ft0, 96(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 3
@@ -2900,7 +2923,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 3
 ; RV64-NEXT:  .LBB21_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 88(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 2
@@ -2909,7 +2932,7 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 2
 ; RV64-NEXT:  .LBB21_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 80(sp)
 ; RV64-NEXT:    bnez a0, .LBB21_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v28, v12, 1
@@ -2918,13 +2941,14 @@
 ; RV64-NEXT:    vslidedown.vi v28, v8, 1
 ; RV64-NEXT:  .LBB21_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v28
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 72(sp)
 ; RV64-NEXT:    vsetivli zero, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -192
+; RV64-NEXT:    ld s0, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 192
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq double %a, %b
   %v = select i1 %cmp, <8 x double> %c, <8 x double> %d
@@ -2934,20 +2958,20 @@
 define <16 x double> @select_v16f64(i1 zeroext %c, <16 x double> %a, <16 x double> %b) {
 ; RV32-LABEL: select_v16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -256
-; RV32-NEXT:    .cfi_def_cfa_offset 256
-; RV32-NEXT:    sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -384
+; RV32-NEXT:    .cfi_def_cfa_offset 384
+; RV32-NEXT:    sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 376(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 256
+; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    bnez a0, .LBB22_3
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v16
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 128(sp)
 ; RV32-NEXT:    beqz a0, .LBB22_4
 ; RV32-NEXT:  .LBB22_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
@@ -2956,14 +2980,14 @@
 ; RV32-NEXT:  .LBB22_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 128(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_2
 ; RV32-NEXT:  .LBB22_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
 ; RV32-NEXT:    vslidedown.vi v24, v16, 15
 ; RV32-NEXT:  .LBB22_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 120(sp)
+; RV32-NEXT:    fsd ft0, 248(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 14
@@ -2972,7 +2996,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 14
 ; RV32-NEXT:  .LBB22_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 112(sp)
+; RV32-NEXT:    fsd ft0, 240(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 13
@@ -2981,7 +3005,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 13
 ; RV32-NEXT:  .LBB22_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 104(sp)
+; RV32-NEXT:    fsd ft0, 232(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 12
@@ -2990,7 +3014,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 12
 ; RV32-NEXT:  .LBB22_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 96(sp)
+; RV32-NEXT:    fsd ft0, 224(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 11
@@ -2999,7 +3023,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 11
 ; RV32-NEXT:  .LBB22_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 88(sp)
+; RV32-NEXT:    fsd ft0, 216(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 10
@@ -3008,7 +3032,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 10
 ; RV32-NEXT:  .LBB22_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 80(sp)
+; RV32-NEXT:    fsd ft0, 208(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 9
@@ -3017,7 +3041,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 9
 ; RV32-NEXT:  .LBB22_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 72(sp)
+; RV32-NEXT:    fsd ft0, 200(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 8
@@ -3026,7 +3050,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 8
 ; RV32-NEXT:  .LBB22_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 64(sp)
+; RV32-NEXT:    fsd ft0, 192(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 7
@@ -3035,7 +3059,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 7
 ; RV32-NEXT:  .LBB22_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 56(sp)
+; RV32-NEXT:    fsd ft0, 184(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 6
@@ -3044,7 +3068,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 6
 ; RV32-NEXT:  .LBB22_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 48(sp)
+; RV32-NEXT:    fsd ft0, 176(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 5
@@ -3053,7 +3077,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 5
 ; RV32-NEXT:  .LBB22_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 40(sp)
+; RV32-NEXT:    fsd ft0, 168(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 4
@@ -3062,7 +3086,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 4
 ; RV32-NEXT:  .LBB22_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 32(sp)
+; RV32-NEXT:    fsd ft0, 160(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 3
@@ -3071,7 +3095,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 3
 ; RV32-NEXT:  .LBB22_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 152(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 2
@@ -3080,7 +3104,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 2
 ; RV32-NEXT:  .LBB22_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 144(sp)
 ; RV32-NEXT:    bnez a0, .LBB22_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v8, v16, 1
@@ -3089,31 +3113,32 @@
 ; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:  .LBB22_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 136(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e64,m8,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -256
-; RV32-NEXT:    lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 256
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -384
+; RV32-NEXT:    lw s0, 376(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 384
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: select_v16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -256
-; RV64-NEXT:    .cfi_def_cfa_offset 256
-; RV64-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -384
+; RV64-NEXT:    .cfi_def_cfa_offset 384
+; RV64-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 256
+; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    bnez a0, .LBB22_3
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v16
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 128(sp)
 ; RV64-NEXT:    beqz a0, .LBB22_4
 ; RV64-NEXT:  .LBB22_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
@@ -3122,14 +3147,14 @@
 ; RV64-NEXT:  .LBB22_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 128(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_2
 ; RV64-NEXT:  .LBB22_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
 ; RV64-NEXT:    vslidedown.vi v24, v16, 15
 ; RV64-NEXT:  .LBB22_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 120(sp)
+; RV64-NEXT:    fsd ft0, 248(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 14
@@ -3138,7 +3163,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 14
 ; RV64-NEXT:  .LBB22_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 112(sp)
+; RV64-NEXT:    fsd ft0, 240(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 13
@@ -3147,7 +3172,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 13
 ; RV64-NEXT:  .LBB22_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 104(sp)
+; RV64-NEXT:    fsd ft0, 232(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 12
@@ -3156,7 +3181,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 12
 ; RV64-NEXT:  .LBB22_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 96(sp)
+; RV64-NEXT:    fsd ft0, 224(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 11
@@ -3165,7 +3190,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 11
 ; RV64-NEXT:  .LBB22_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 88(sp)
+; RV64-NEXT:    fsd ft0, 216(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 10
@@ -3174,7 +3199,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 10
 ; RV64-NEXT:  .LBB22_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 80(sp)
+; RV64-NEXT:    fsd ft0, 208(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 9
@@ -3183,7 +3208,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 9
 ; RV64-NEXT:  .LBB22_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 72(sp)
+; RV64-NEXT:    fsd ft0, 200(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 8
@@ -3192,7 +3217,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 8
 ; RV64-NEXT:  .LBB22_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 64(sp)
+; RV64-NEXT:    fsd ft0, 192(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 7
@@ -3201,7 +3226,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 7
 ; RV64-NEXT:  .LBB22_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 56(sp)
+; RV64-NEXT:    fsd ft0, 184(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 6
@@ -3210,7 +3235,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 6
 ; RV64-NEXT:  .LBB22_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 48(sp)
+; RV64-NEXT:    fsd ft0, 176(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 5
@@ -3219,7 +3244,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 5
 ; RV64-NEXT:  .LBB22_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 40(sp)
+; RV64-NEXT:    fsd ft0, 168(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 4
@@ -3228,7 +3253,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 4
 ; RV64-NEXT:  .LBB22_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 32(sp)
+; RV64-NEXT:    fsd ft0, 160(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 3
@@ -3237,7 +3262,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 3
 ; RV64-NEXT:  .LBB22_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 152(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 2
@@ -3246,7 +3271,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 2
 ; RV64-NEXT:  .LBB22_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 144(sp)
 ; RV64-NEXT:    bnez a0, .LBB22_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v8, v16, 1
@@ -3255,13 +3280,14 @@
 ; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:  .LBB22_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 136(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e64,m8,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -256
-; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 256
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -384
+; RV64-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 384
 ; RV64-NEXT:    ret
   %v = select i1 %c, <16 x double> %a, <16 x double> %b
   ret <16 x double> %v
@@ -3270,13 +3296,13 @@
 define <16 x double> @selectcc_v16f64(double %a, double %b, <16 x double> %c, <16 x double> %d) {
 ; RV32-LABEL: selectcc_v16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -256
-; RV32-NEXT:    .cfi_def_cfa_offset 256
-; RV32-NEXT:    sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -384
+; RV32-NEXT:    .cfi_def_cfa_offset 384
+; RV32-NEXT:    sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 376(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 256
+; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    feq.d a0, fa0, fa1
@@ -3284,7 +3310,7 @@
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v16
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 128(sp)
 ; RV32-NEXT:    beqz a0, .LBB23_4
 ; RV32-NEXT:  .LBB23_2:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
@@ -3293,14 +3319,14 @@
 ; RV32-NEXT:  .LBB23_3:
 ; RV32-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 0(sp)
+; RV32-NEXT:    fsd ft0, 128(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_2
 ; RV32-NEXT:  .LBB23_4:
 ; RV32-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
 ; RV32-NEXT:    vslidedown.vi v24, v16, 15
 ; RV32-NEXT:  .LBB23_5:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 120(sp)
+; RV32-NEXT:    fsd ft0, 248(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_7
 ; RV32-NEXT:  # %bb.6:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 14
@@ -3309,7 +3335,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 14
 ; RV32-NEXT:  .LBB23_8:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 112(sp)
+; RV32-NEXT:    fsd ft0, 240(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 13
@@ -3318,7 +3344,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 13
 ; RV32-NEXT:  .LBB23_11:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 104(sp)
+; RV32-NEXT:    fsd ft0, 232(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_13
 ; RV32-NEXT:  # %bb.12:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 12
@@ -3327,7 +3353,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 12
 ; RV32-NEXT:  .LBB23_14:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 96(sp)
+; RV32-NEXT:    fsd ft0, 224(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_16
 ; RV32-NEXT:  # %bb.15:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 11
@@ -3336,7 +3362,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 11
 ; RV32-NEXT:  .LBB23_17:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 88(sp)
+; RV32-NEXT:    fsd ft0, 216(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_19
 ; RV32-NEXT:  # %bb.18:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 10
@@ -3345,7 +3371,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 10
 ; RV32-NEXT:  .LBB23_20:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 80(sp)
+; RV32-NEXT:    fsd ft0, 208(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_22
 ; RV32-NEXT:  # %bb.21:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 9
@@ -3354,7 +3380,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 9
 ; RV32-NEXT:  .LBB23_23:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 72(sp)
+; RV32-NEXT:    fsd ft0, 200(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_25
 ; RV32-NEXT:  # %bb.24:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 8
@@ -3363,7 +3389,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 8
 ; RV32-NEXT:  .LBB23_26:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 64(sp)
+; RV32-NEXT:    fsd ft0, 192(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_28
 ; RV32-NEXT:  # %bb.27:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 7
@@ -3372,7 +3398,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 7
 ; RV32-NEXT:  .LBB23_29:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 56(sp)
+; RV32-NEXT:    fsd ft0, 184(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_31
 ; RV32-NEXT:  # %bb.30:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 6
@@ -3381,7 +3407,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 6
 ; RV32-NEXT:  .LBB23_32:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 48(sp)
+; RV32-NEXT:    fsd ft0, 176(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_34
 ; RV32-NEXT:  # %bb.33:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 5
@@ -3390,7 +3416,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 5
 ; RV32-NEXT:  .LBB23_35:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 40(sp)
+; RV32-NEXT:    fsd ft0, 168(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_37
 ; RV32-NEXT:  # %bb.36:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 4
@@ -3399,7 +3425,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 4
 ; RV32-NEXT:  .LBB23_38:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 32(sp)
+; RV32-NEXT:    fsd ft0, 160(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_40
 ; RV32-NEXT:  # %bb.39:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 3
@@ -3408,7 +3434,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 3
 ; RV32-NEXT:  .LBB23_41:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 24(sp)
+; RV32-NEXT:    fsd ft0, 152(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_43
 ; RV32-NEXT:  # %bb.42:
 ; RV32-NEXT:    vslidedown.vi v24, v16, 2
@@ -3417,7 +3443,7 @@
 ; RV32-NEXT:    vslidedown.vi v24, v8, 2
 ; RV32-NEXT:  .LBB23_44:
 ; RV32-NEXT:    vfmv.f.s ft0, v24
-; RV32-NEXT:    fsd ft0, 16(sp)
+; RV32-NEXT:    fsd ft0, 144(sp)
 ; RV32-NEXT:    bnez a0, .LBB23_46
 ; RV32-NEXT:  # %bb.45:
 ; RV32-NEXT:    vslidedown.vi v8, v16, 1
@@ -3426,24 +3452,25 @@
 ; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:  .LBB23_47:
 ; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    fsd ft0, 8(sp)
+; RV32-NEXT:    fsd ft0, 136(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e64,m8,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -256
-; RV32-NEXT:    lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 256
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -384
+; RV32-NEXT:    lw s0, 376(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 384
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: selectcc_v16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -256
-; RV64-NEXT:    .cfi_def_cfa_offset 256
-; RV64-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi sp, sp, -384
+; RV64-NEXT:    .cfi_def_cfa_offset 384
+; RV64-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 256
+; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    feq.d a0, fa0, fa1
@@ -3451,7 +3478,7 @@
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v16
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 128(sp)
 ; RV64-NEXT:    beqz a0, .LBB23_4
 ; RV64-NEXT:  .LBB23_2:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
@@ -3460,14 +3487,14 @@
 ; RV64-NEXT:  .LBB23_3:
 ; RV64-NEXT:    vsetvli zero, zero, e64,m8,ta,mu
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 0(sp)
+; RV64-NEXT:    fsd ft0, 128(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_2
 ; RV64-NEXT:  .LBB23_4:
 ; RV64-NEXT:    vsetivli zero, 1, e64,m8,ta,mu
 ; RV64-NEXT:    vslidedown.vi v24, v16, 15
 ; RV64-NEXT:  .LBB23_5:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 120(sp)
+; RV64-NEXT:    fsd ft0, 248(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_7
 ; RV64-NEXT:  # %bb.6:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 14
@@ -3476,7 +3503,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 14
 ; RV64-NEXT:  .LBB23_8:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 112(sp)
+; RV64-NEXT:    fsd ft0, 240(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_10
 ; RV64-NEXT:  # %bb.9:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 13
@@ -3485,7 +3512,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 13
 ; RV64-NEXT:  .LBB23_11:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 104(sp)
+; RV64-NEXT:    fsd ft0, 232(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_13
 ; RV64-NEXT:  # %bb.12:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 12
@@ -3494,7 +3521,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 12
 ; RV64-NEXT:  .LBB23_14:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 96(sp)
+; RV64-NEXT:    fsd ft0, 224(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_16
 ; RV64-NEXT:  # %bb.15:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 11
@@ -3503,7 +3530,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 11
 ; RV64-NEXT:  .LBB23_17:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 88(sp)
+; RV64-NEXT:    fsd ft0, 216(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_19
 ; RV64-NEXT:  # %bb.18:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 10
@@ -3512,7 +3539,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 10
 ; RV64-NEXT:  .LBB23_20:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 80(sp)
+; RV64-NEXT:    fsd ft0, 208(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_22
 ; RV64-NEXT:  # %bb.21:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 9
@@ -3521,7 +3548,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 9
 ; RV64-NEXT:  .LBB23_23:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 72(sp)
+; RV64-NEXT:    fsd ft0, 200(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_25
 ; RV64-NEXT:  # %bb.24:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 8
@@ -3530,7 +3557,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 8
 ; RV64-NEXT:  .LBB23_26:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 64(sp)
+; RV64-NEXT:    fsd ft0, 192(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_28
 ; RV64-NEXT:  # %bb.27:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 7
@@ -3539,7 +3566,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 7
 ; RV64-NEXT:  .LBB23_29:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 56(sp)
+; RV64-NEXT:    fsd ft0, 184(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_31
 ; RV64-NEXT:  # %bb.30:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 6
@@ -3548,7 +3575,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 6
 ; RV64-NEXT:  .LBB23_32:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 48(sp)
+; RV64-NEXT:    fsd ft0, 176(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_34
 ; RV64-NEXT:  # %bb.33:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 5
@@ -3557,7 +3584,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 5
 ; RV64-NEXT:  .LBB23_35:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 40(sp)
+; RV64-NEXT:    fsd ft0, 168(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_37
 ; RV64-NEXT:  # %bb.36:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 4
@@ -3566,7 +3593,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 4
 ; RV64-NEXT:  .LBB23_38:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 32(sp)
+; RV64-NEXT:    fsd ft0, 160(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_40
 ; RV64-NEXT:  # %bb.39:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 3
@@ -3575,7 +3602,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 3
 ; RV64-NEXT:  .LBB23_41:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 24(sp)
+; RV64-NEXT:    fsd ft0, 152(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_43
 ; RV64-NEXT:  # %bb.42:
 ; RV64-NEXT:    vslidedown.vi v24, v16, 2
@@ -3584,7 +3611,7 @@
 ; RV64-NEXT:    vslidedown.vi v24, v8, 2
 ; RV64-NEXT:  .LBB23_44:
 ; RV64-NEXT:    vfmv.f.s ft0, v24
-; RV64-NEXT:    fsd ft0, 16(sp)
+; RV64-NEXT:    fsd ft0, 144(sp)
 ; RV64-NEXT:    bnez a0, .LBB23_46
 ; RV64-NEXT:  # %bb.45:
 ; RV64-NEXT:    vslidedown.vi v8, v16, 1
@@ -3593,13 +3620,14 @@
 ; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:  .LBB23_47:
 ; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    fsd ft0, 8(sp)
+; RV64-NEXT:    fsd ft0, 136(sp)
 ; RV64-NEXT:    vsetivli zero, 16, e64,m8,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -256
-; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 256
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -384
+; RV64-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 384
 ; RV64-NEXT:    ret
   %cmp = fcmp oeq double %a, %b
   %v = select i1 %cmp, <16 x double> %c, <16 x double> %d
diff --git a/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir b/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir
--- a/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir
@@ -25,8 +25,7 @@
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   $x2 = frame-setup ADDI $x2, -16
-  ; CHECK:   CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK:   CFI_INSTRUCTION def_cfa_offset 0
   ; CHECK:   $x10 = PseudoReadVLENB
   ; CHECK:   $x10 = SLLI killed $x10, 1
   ; CHECK:   $x2 = SUB $x2, killed $x10
@@ -34,7 +33,6 @@
   ; CHECK:   $x10 = PseudoReadVLENB
   ; CHECK:   $x10 = SLLI killed $x10, 1
   ; CHECK:   $x2 = ADD $x2, killed $x10
-  ; CHECK:   $x2 = frame-destroy ADDI $x2, 16
   ; CHECK:   PseudoRET
   bb.0:
   bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/localvar.ll b/llvm/test/CodeGen/RISCV/rvv/localvar.ll
--- a/llvm/test/CodeGen/RISCV/rvv/localvar.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/localvar.ll
@@ -290,18 +290,18 @@
 define i64 @fixed_object(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8) nounwind {
 ; RV64IV-LABEL: fixed_object:
 ; RV64IV:       # %bb.0:
-; RV64IV-NEXT:    addi sp, sp, -32
+; RV64IV-NEXT:    addi sp, sp, -16
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    slli a0, a0, 3
 ; RV64IV-NEXT:    sub sp, sp, a0
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    slli a0, a0, 3
 ; RV64IV-NEXT:    add a0, sp, a0
-; RV64IV-NEXT:    ld a0, 32(a0)
+; RV64IV-NEXT:    ld a0, 16(a0)
 ; RV64IV-NEXT:    csrr a1, vlenb
 ; RV64IV-NEXT:    slli a1, a1, 3
 ; RV64IV-NEXT:    add sp, sp, a1
-; RV64IV-NEXT:    addi sp, sp, 32
+; RV64IV-NEXT:    addi sp, sp, 16
 ; RV64IV-NEXT:    ret
   %fixed_size = alloca i32
   %rvv_vector = alloca <vscale x 8 x i64>, align 8
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -663,15 +663,15 @@
 ;
 ; RV32MV-LABEL: test_srem_vec:
 ; RV32MV:       # %bb.0:
-; RV32MV-NEXT:    addi sp, sp, -64
-; RV32MV-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
-; RV32MV-NEXT:    addi s0, sp, 64
+; RV32MV-NEXT:    addi sp, sp, -96
+; RV32MV-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
+; RV32MV-NEXT:    addi s0, sp, 96
 ; RV32MV-NEXT:    andi sp, sp, -32
 ; RV32MV-NEXT:    mv s1, a0
 ; RV32MV-NEXT:    lw a0, 8(a0)
@@ -695,28 +695,29 @@
 ; RV32MV-NEXT:    addi a2, zero, 6
 ; RV32MV-NEXT:    mv a3, zero
 ; RV32MV-NEXT:    call __moddi3@plt
-; RV32MV-NEXT:    sw a1, 4(sp)
-; RV32MV-NEXT:    sw a0, 0(sp)
+; RV32MV-NEXT:    sw a1, 36(sp)
+; RV32MV-NEXT:    sw a0, 32(sp)
 ; RV32MV-NEXT:    addi a2, zero, -5
 ; RV32MV-NEXT:    addi a3, zero, -1
 ; RV32MV-NEXT:    mv a0, s4
 ; RV32MV-NEXT:    mv a1, s5
 ; RV32MV-NEXT:    call __moddi3@plt
-; RV32MV-NEXT:    sw a1, 20(sp)
-; RV32MV-NEXT:    sw a0, 16(sp)
+; RV32MV-NEXT:    sw a1, 52(sp)
+; RV32MV-NEXT:    sw a0, 48(sp)
 ; RV32MV-NEXT:    addi a2, zero, 7
 ; RV32MV-NEXT:    mv a0, s2
 ; RV32MV-NEXT:    mv a1, s3
 ; RV32MV-NEXT:    mv a3, zero
 ; RV32MV-NEXT:    call __moddi3@plt
-; RV32MV-NEXT:    sw a1, 12(sp)
-; RV32MV-NEXT:    sw a0, 8(sp)
+; RV32MV-NEXT:    sw a1, 44(sp)
+; RV32MV-NEXT:    sw a0, 40(sp)
 ; RV32MV-NEXT:    addi a0, zero, 85
 ; RV32MV-NEXT:    vsetivli zero, 1, e8,mf8,ta,mu
 ; RV32MV-NEXT:    vmv.s.x v0, a0
 ; RV32MV-NEXT:    vsetivli zero, 8, e32,m2,ta,mu
 ; RV32MV-NEXT:    vmv.v.i v26, 1
-; RV32MV-NEXT:    vle32.v v28, (sp)
+; RV32MV-NEXT:    addi a0, sp, 32
+; RV32MV-NEXT:    vle32.v v28, (a0)
 ; RV32MV-NEXT:    lui a0, %hi(.LCPI3_0)
 ; RV32MV-NEXT:    addi a0, a0, %lo(.LCPI3_0)
 ; RV32MV-NEXT:    vle32.v v30, (a0)
@@ -755,23 +756,23 @@
 ; RV32MV-NEXT:    slli a0, a0, 2
 ; RV32MV-NEXT:    or a0, a1, a0
 ; RV32MV-NEXT:    sw a0, 8(s1)
-; RV32MV-NEXT:    addi sp, s0, -64
-; RV32MV-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32MV-NEXT:    addi sp, sp, 64
+; RV32MV-NEXT:    addi sp, s0, -96
+; RV32MV-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; RV32MV-NEXT:    addi sp, sp, 96
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_srem_vec:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    addi sp, sp, -64
-; RV64MV-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64MV-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64MV-NEXT:    addi s0, sp, 64
+; RV64MV-NEXT:    addi sp, sp, -96
+; RV64MV-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
+; RV64MV-NEXT:    sd s0, 80(sp) # 8-byte Folded Spill
+; RV64MV-NEXT:    addi s0, sp, 96
 ; RV64MV-NEXT:    andi sp, sp, -32
 ; RV64MV-NEXT:    lb a1, 12(a0)
 ; RV64MV-NEXT:    lwu a2, 8(a0)
@@ -804,7 +805,7 @@
 ; RV64MV-NEXT:    addi a5, zero, 6
 ; RV64MV-NEXT:    mul a1, a1, a5
 ; RV64MV-NEXT:    sub a1, a3, a1
-; RV64MV-NEXT:    sd a1, 0(sp)
+; RV64MV-NEXT:    sd a1, 32(sp)
 ; RV64MV-NEXT:    lui a1, 1035469
 ; RV64MV-NEXT:    addiw a1, a1, -819
 ; RV64MV-NEXT:    slli a1, a1, 12
@@ -820,7 +821,7 @@
 ; RV64MV-NEXT:    slli a3, a1, 2
 ; RV64MV-NEXT:    add a1, a3, a1
 ; RV64MV-NEXT:    add a1, a2, a1
-; RV64MV-NEXT:    sd a1, 16(sp)
+; RV64MV-NEXT:    sd a1, 48(sp)
 ; RV64MV-NEXT:    lui a1, 18725
 ; RV64MV-NEXT:    addiw a1, a1, -1755
 ; RV64MV-NEXT:    slli a1, a1, 12
@@ -836,9 +837,10 @@
 ; RV64MV-NEXT:    slli a2, a1, 3
 ; RV64MV-NEXT:    sub a1, a1, a2
 ; RV64MV-NEXT:    add a1, a4, a1
-; RV64MV-NEXT:    sd a1, 8(sp)
+; RV64MV-NEXT:    sd a1, 40(sp)
 ; RV64MV-NEXT:    vsetivli zero, 4, e64,m2,ta,mu
-; RV64MV-NEXT:    vle64.v v26, (sp)
+; RV64MV-NEXT:    addi a1, sp, 32
+; RV64MV-NEXT:    vle64.v v26, (a1)
 ; RV64MV-NEXT:    lui a1, %hi(.LCPI3_0)
 ; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI3_0)
 ; RV64MV-NEXT:    vle64.v v28, (a1)
@@ -865,10 +867,10 @@
 ; RV64MV-NEXT:    slli a2, a3, 33
 ; RV64MV-NEXT:    or a1, a1, a2
 ; RV64MV-NEXT:    sd a1, 0(a0)
-; RV64MV-NEXT:    addi sp, s0, -64
-; RV64MV-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64MV-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64MV-NEXT:    addi sp, sp, 64
+; RV64MV-NEXT:    addi sp, s0, -96
+; RV64MV-NEXT:    ld s0, 80(sp) # 8-byte Folded Reload
+; RV64MV-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; RV64MV-NEXT:    addi sp, sp, 96
 ; RV64MV-NEXT:    ret
   %ld = load <3 x i33>, <3 x i33>* %X
   %srem = srem <3 x i33> %ld, <i33 6, i33 7, i33 -5>