diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -843,15 +843,27 @@
   return Offset;
 }
 
+static bool hasRVVSpillWithFIs(MachineFunction &MF, const RISCVInstrInfo &TII) {
+  if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtV())
+    return false;
+  return any_of(MF, [&TII](const MachineBasicBlock &MBB) {
+    return any_of(MBB, [&TII](const MachineInstr &MI) {
+      return TII.isRVVSpill(MI, /*CheckFIs*/ true);
+    });
+  });
+}
+
 void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
-  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const RISCVRegisterInfo *RegInfo =
+      MF.getSubtarget<RISCVSubtarget>().getRegisterInfo();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterClass *RC = &RISCV::GPRRegClass;
   auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
 
   int64_t RVVStackSize = assignRVVStackObjectOffsets(MFI);
   RVFI->setRVVStackSize(RVVStackSize);
+  const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
 
   // estimateStackSize has been observed to under-estimate the final stack
   // size, so give ourselves wiggle-room by checking for stack size
@@ -859,7 +871,10 @@
   // FIXME: It may be possible to craft a function with a small stack that
   // still needs an emergency spill slot for branch relaxation. This case
   // would currently be missed.
-  if (!isInt<11>(MFI.estimateStackSize(MF)) || RVVStackSize != 0) {
+  // RVV loads & stores have no capacity to hold the immediate address offsets,
+  // so we must always reserve an emergency spill slot if the MachineFunction
+  // contains any RVV spills.
+  if (!isInt<11>(MFI.estimateStackSize(MF)) || hasRVVSpillWithFIs(MF, TII)) {
     int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
                                           RegInfo->getSpillAlign(*RC), false);
     RS->addScavengingFrameIndex(RegScavFI);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -147,6 +147,11 @@
                                  MachineBasicBlock::iterator II,
                                  const DebugLoc &DL, int64_t Amount) const;
 
+  // Returns true if the given MI is an RVV instruction opcode for which we may
+  // expect to see a FrameIndex operand. When CheckFIs is true, the instruction
+  // must contain at least one FrameIndex operand.
+  bool isRVVSpill(const MachineInstr &MI, bool CheckFIs) const;
+
   Optional<std::pair<unsigned, unsigned>>
   isRVVSpillForZvlsseg(unsigned Opcode) const;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1412,6 +1412,46 @@
   return VL;
 }
 
+static bool isRVVWholeLoadStore(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case RISCV::VS1R_V:
+  case RISCV::VS2R_V:
+  case RISCV::VS4R_V:
+  case RISCV::VS8R_V:
+  case RISCV::VL1RE8_V:
+  case RISCV::VL2RE8_V:
+  case RISCV::VL4RE8_V:
+  case RISCV::VL8RE8_V:
+  case RISCV::VL1RE16_V:
+  case RISCV::VL2RE16_V:
+  case RISCV::VL4RE16_V:
+  case RISCV::VL8RE16_V:
+  case RISCV::VL1RE32_V:
+  case RISCV::VL2RE32_V:
+  case RISCV::VL4RE32_V:
+  case RISCV::VL8RE32_V:
+  case RISCV::VL1RE64_V:
+  case RISCV::VL2RE64_V:
+  case RISCV::VL4RE64_V:
+  case RISCV::VL8RE64_V:
+    return true;
+  }
+}
+
+bool RISCVInstrInfo::isRVVSpill(const MachineInstr &MI, bool CheckFIs) const {
+  // RVV lacks any support for immediate addressing for stack addresses, so be
+  // conservative.
+  unsigned Opcode = MI.getOpcode();
+  if (!RISCVVPseudosTable::getPseudoInfo(Opcode) &&
+      !isRVVWholeLoadStore(Opcode) && !isRVVSpillForZvlsseg(Opcode))
+    return false;
+  return !CheckFIs || any_of(MI.operands(), [](const MachineOperand &MO) {
+    return MO.isFI();
+  });
+}
+
 Optional<std::pair<unsigned, unsigned>>
 RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const {
   switch (Opcode) {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -156,34 +156,6 @@
   return true;
 }
 
-static bool isRVVWholeLoadStore(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case RISCV::VS1R_V:
-  case RISCV::VS2R_V:
-  case RISCV::VS4R_V:
-  case RISCV::VS8R_V:
-  case RISCV::VL1RE8_V:
-  case RISCV::VL2RE8_V:
-  case RISCV::VL4RE8_V:
-  case RISCV::VL8RE8_V:
-  case RISCV::VL1RE16_V:
-  case RISCV::VL2RE16_V:
-  case RISCV::VL4RE16_V:
-  case RISCV::VL8RE16_V:
-  case RISCV::VL1RE32_V:
-  case RISCV::VL2RE32_V:
-  case RISCV::VL4RE32_V:
-  case RISCV::VL8RE32_V:
-  case RISCV::VL1RE64_V:
-  case RISCV::VL2RE64_V:
-  case RISCV::VL4RE64_V:
-  case RISCV::VL8RE64_V:
-    return true;
-  }
-}
-
 void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                             int SPAdj, unsigned FIOperandNum,
                                             RegScavenger *RS) const {
@@ -199,10 +171,8 @@
   Register FrameReg;
   StackOffset Offset =
       getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg);
-  bool isRVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()) ||
-               isRVVWholeLoadStore(MI.getOpcode()) ||
-               TII->isRVVSpillForZvlsseg(MI.getOpcode());
-  if (!isRVV)
+  bool IsRVVSpill = TII->isRVVSpill(MI, /*CheckFIs*/ false);
+  if (!IsRVVSpill)
     Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
 
   if (!isInt<32>(Offset.getFixed())) {
@@ -255,7 +225,7 @@
       // Offset = (fixed offset, 0)
       MI.getOperand(FIOperandNum)
           .ChangeToRegister(FrameReg, false, false, FrameRegIsKill);
-      if (!isRVV)
+      if (!IsRVVSpill)
        MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
       else {
         if (Offset.getFixed()) {
@@ -286,7 +256,7 @@
         .addReg(FrameReg, getKillRegState(FrameRegIsKill))
         .addReg(ScalableFactorRegister, RegState::Kill);
 
-    if (isRVV && Offset.getFixed()) {
+    if (IsRVVSpill && Offset.getFixed()) {
       // Scalable load/store has no immediate argument. We need to add the
       // fixed part into the load/store base address.
       BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), VL)
@@ -296,7 +266,7 @@
   // 3.
Replace address register with calculated address register MI.getOperand(FIOperandNum).ChangeToRegister(VL, false, false, true); - if (!isRVV) + if (!IsRVVSpill) MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); } diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll --- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll @@ -5,12 +5,10 @@ define void @lmul1() nounwind { ; CHECK-LABEL: lmul1: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = alloca ret void @@ -19,14 +17,12 @@ define void @lmul2() nounwind { ; CHECK-LABEL: lmul2: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = alloca ret void @@ -75,7 +71,6 @@ define void @lmul1_and_2() nounwind { ; CHECK-LABEL: lmul1_and_2: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 @@ -84,7 +79,6 @@ ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v1 = alloca %v2 = alloca @@ -138,7 +132,6 @@ define void @lmul2_and_1() nounwind { ; CHECK-LABEL: lmul2_and_1: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 @@ -147,7 +140,6 @@ ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v1 = alloca %v2 = alloca @@ -250,18 +242,18 @@ define void @gpr_and_lmul1_and_2() nounwind { ; CHECK-LABEL: gpr_and_lmul1_and_2: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, zero, 3 -; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd a0, 8(sp) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %x1 = alloca i64 %v1 = alloca @@ -273,21 +265,21 @@ define void @gpr_and_lmul1_and_4() nounwind { ; CHECK-LABEL: gpr_and_lmul1_and_4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -64 -; CHECK-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 64 +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi s0, sp, 32 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: andi sp, sp, -32 ; CHECK-NEXT: addi a0, zero, 3 -; CHECK-NEXT: sd a0, 40(sp) -; CHECK-NEXT: addi sp, s0, -64 -; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 64 +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi sp, s0, -32 +; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; 
CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: ret %x1 = alloca i64 %v1 = alloca @@ -379,14 +371,12 @@ define void @masks() nounwind { ; CHECK-LABEL: masks: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v1 = alloca %v2 = alloca diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -7,8 +7,8 @@ define void @bswap_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v8i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -16,7 +16,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -24,59 +24,60 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 20(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 18(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 
8, e16,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v8i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 @@ -84,7 +85,7 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 @@ -92,59 +93,60 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 18(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v8i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 
+; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -152,7 +154,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 0(sp) +; LMULMAX1-RV32-NEXT: sh a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -160,59 +162,60 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 14(sp) +; LMULMAX1-RV32-NEXT: sh a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 12(sp) +; LMULMAX1-RV32-NEXT: sh a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 10(sp) +; LMULMAX1-RV32-NEXT: sh a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 8(sp) +; LMULMAX1-RV32-NEXT: sh a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 6(sp) +; LMULMAX1-RV32-NEXT: sh a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 4(sp) +; LMULMAX1-RV32-NEXT: sh a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: slli a2, a1, 8 ; LMULMAX1-RV32-NEXT: slli a1, a1, 16 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: sh a1, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v8i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 @@ -220,7 +223,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 0(sp) +; LMULMAX1-RV64-NEXT: sh a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: 
vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 @@ -228,53 +231,54 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 14(sp) +; LMULMAX1-RV64-NEXT: sh a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 12(sp) +; LMULMAX1-RV64-NEXT: sh a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 10(sp) +; LMULMAX1-RV64-NEXT: sh a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 8(sp) +; LMULMAX1-RV64-NEXT: sh a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 6(sp) +; LMULMAX1-RV64-NEXT: sh a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 4(sp) +; LMULMAX1-RV64-NEXT: sh a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: slli a2, a1, 8 ; LMULMAX1-RV64-NEXT: slli a1, a1, 48 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: sh a1, 2(sp) +; LMULMAX1-RV64-NEXT: sh a1, 18(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y @@ -287,8 +291,8 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v4i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -304,7 +308,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -317,7 +321,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; 
LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 @@ -329,7 +333,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 @@ -341,17 +345,18 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a3 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 20(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v4i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 @@ -367,7 +372,7 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 24 ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sw a1, 0(sp) +; LMULMAX2-RV64-NEXT: sw a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 @@ -380,7 +385,7 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 24 ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sw a1, 12(sp) +; LMULMAX2-RV64-NEXT: sw a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 @@ -392,7 +397,7 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 24 ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sw a1, 8(sp) +; LMULMAX2-RV64-NEXT: sw a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 @@ -404,17 +409,18 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 24 ; LMULMAX2-RV64-NEXT: or a1, a1, a3 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: sw a1, 20(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v4i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -430,7 +436,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -443,7 +449,7 @@ ; 
LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 @@ -455,7 +461,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 @@ -467,17 +473,18 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v4i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 @@ -493,7 +500,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 @@ -506,7 +513,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 @@ -518,7 +525,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 @@ -530,11 +537,12 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a3 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y @@ -547,8 +555,8 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v2i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 
@@ -564,7 +572,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 20(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -577,7 +585,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 28(sp) ; LMULMAX2-RV32-NEXT: addi a1, zero, 32 ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 @@ -590,7 +598,7 @@ ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a5 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: sw a2, 0(sp) +; LMULMAX2-RV32-NEXT: sw a2, 16(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 @@ -602,12 +610,13 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a3 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v2i64: @@ -676,8 +685,8 @@ ; ; LMULMAX1-RV32-LABEL: bswap_v2i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -693,7 +702,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -706,7 +715,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: addi a1, zero, 32 ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 @@ -719,7 +728,7 @@ ; LMULMAX1-RV32-NEXT: slli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a5 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 -; LMULMAX1-RV32-NEXT: sw a2, 0(sp) +; LMULMAX1-RV32-NEXT: sw a2, 16(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 @@ -731,12 +740,13 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; 
; LMULMAX1-RV64-LABEL: bswap_v2i64: @@ -813,13 +823,13 @@ define void @bswap_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v16i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -829,7 +839,7 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -837,123 +847,124 @@ ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 30(sp) +; LMULMAX2-RV32-NEXT: sh a1, 62(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 28(sp) +; LMULMAX2-RV32-NEXT: sh a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 26(sp) +; LMULMAX2-RV32-NEXT: sh a1, 58(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 24(sp) +; LMULMAX2-RV32-NEXT: sh a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 22(sp) +; LMULMAX2-RV32-NEXT: sh a1, 54(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 20(sp) +; LMULMAX2-RV32-NEXT: sh a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 18(sp) +; LMULMAX2-RV32-NEXT: sh a1, 50(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: 
sh a1, 16(sp) +; LMULMAX2-RV32-NEXT: sh a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 46(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 42(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 38(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 36(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: slli a2, a1, 8 ; LMULMAX2-RV32-NEXT: slli a1, a1, 16 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 34(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v16i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, 
sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -963,7 +974,7 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 @@ -971,118 +982,119 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 30(sp) +; LMULMAX2-RV64-NEXT: sh a1, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 28(sp) +; LMULMAX2-RV64-NEXT: sh a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 26(sp) +; LMULMAX2-RV64-NEXT: sh a1, 58(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 24(sp) +; LMULMAX2-RV64-NEXT: sh a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 22(sp) +; LMULMAX2-RV64-NEXT: sh a1, 54(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 20(sp) +; LMULMAX2-RV64-NEXT: sh a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 18(sp) +; LMULMAX2-RV64-NEXT: sh a1, 50(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 16(sp) +; LMULMAX2-RV64-NEXT: sh a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 46(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli 
a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 42(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 38(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 36(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: slli a2, a1, 8 ; LMULMAX2-RV64-NEXT: slli a1, a1, 48 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 34(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v16i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a1) @@ -1092,7 +1104,7 @@ ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 16(sp) +; LMULMAX1-RV32-NEXT: sh a2, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 @@ -1100,117 +1112,118 @@ ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 30(sp) +; LMULMAX1-RV32-NEXT: sh a2, 46(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 28(sp) +; LMULMAX1-RV32-NEXT: sh a2, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 26(sp) +; 
LMULMAX1-RV32-NEXT: sh a2, 42(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 24(sp) +; LMULMAX1-RV32-NEXT: sh a2, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 22(sp) +; LMULMAX1-RV32-NEXT: sh a2, 38(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 20(sp) +; LMULMAX1-RV32-NEXT: sh a2, 36(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 18(sp) +; LMULMAX1-RV32-NEXT: sh a2, 34(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 0(sp) +; LMULMAX1-RV32-NEXT: sh a2, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 14(sp) +; LMULMAX1-RV32-NEXT: sh a2, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 12(sp) +; LMULMAX1-RV32-NEXT: sh a2, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 10(sp) +; LMULMAX1-RV32-NEXT: sh a2, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 8(sp) +; LMULMAX1-RV32-NEXT: sh a2, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 6(sp) +; LMULMAX1-RV32-NEXT: sh a2, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 4(sp) +; LMULMAX1-RV32-NEXT: sh a2, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: slli a3, a2, 8 ; LMULMAX1-RV32-NEXT: slli a2, a2, 16 ; 
LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: or a2, a3, a2 -; LMULMAX1-RV32-NEXT: sh a2, 2(sp) +; LMULMAX1-RV32-NEXT: sh a2, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a2, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a2) +; LMULMAX1-RV32-NEXT: addi a2, sp, 32 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse16.v v26, (a1) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v16i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a1) @@ -1220,7 +1233,7 @@ ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: sh a2, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 @@ -1228,111 +1241,112 @@ ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: sh a2, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: sh a2, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: sh a2, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: sh a2, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: sh a2, 38(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: sh a2, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 18(sp) +; LMULMAX1-RV64-NEXT: sh a2, 34(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; 
LMULMAX1-RV64-NEXT: sh a2, 0(sp) +; LMULMAX1-RV64-NEXT: sh a2, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 14(sp) +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 12(sp) +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 10(sp) +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 8(sp) +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 6(sp) +; LMULMAX1-RV64-NEXT: sh a2, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 4(sp) +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: slli a2, a2, 48 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: sh a2, 2(sp) +; LMULMAX1-RV64-NEXT: sh a2, 18(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a2, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a2) +; LMULMAX1-RV64-NEXT: addi a2, sp, 32 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse16.v v26, (a1) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y @@ -1345,13 +1359,13 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v8i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: 
vsetivli zero, 8, e32,m2,ta,mu @@ -1369,7 +1383,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 0(sp) +; LMULMAX2-RV32-NEXT: sw a3, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 @@ -1382,7 +1396,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 28(sp) +; LMULMAX2-RV32-NEXT: sw a3, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1394,7 +1408,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 24(sp) +; LMULMAX2-RV32-NEXT: sw a3, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1406,7 +1420,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 20(sp) +; LMULMAX2-RV32-NEXT: sw a3, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1418,7 +1432,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 16(sp) +; LMULMAX2-RV32-NEXT: sw a3, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1430,7 +1444,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 12(sp) +; LMULMAX2-RV32-NEXT: sw a3, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1442,7 +1456,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 8(sp) +; LMULMAX2-RV32-NEXT: sw a3, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1454,25 +1468,26 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a2, a3, a2 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 36(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v8i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; 
LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -1490,7 +1505,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 0(sp) +; LMULMAX2-RV64-NEXT: sw a3, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 @@ -1503,7 +1518,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 28(sp) +; LMULMAX2-RV64-NEXT: sw a3, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1515,7 +1530,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 24(sp) +; LMULMAX2-RV64-NEXT: sw a3, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1527,7 +1542,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 20(sp) +; LMULMAX2-RV64-NEXT: sw a3, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1539,7 +1554,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 16(sp) +; LMULMAX2-RV64-NEXT: sw a3, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1551,7 +1566,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 12(sp) +; LMULMAX2-RV64-NEXT: sw a3, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1563,7 +1578,7 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a3, a3, a5 ; LMULMAX2-RV64-NEXT: or a3, a3, a4 -; LMULMAX2-RV64-NEXT: sw a3, 8(sp) +; LMULMAX2-RV64-NEXT: sw a3, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v26 ; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 @@ -1575,20 +1590,21 @@ ; LMULMAX2-RV64-NEXT: slli a3, a3, 24 ; LMULMAX2-RV64-NEXT: or a2, a3, a2 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: sw a1, 36(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; 
LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v8i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a6) @@ -1606,7 +1622,7 @@ ; LMULMAX1-RV32-NEXT: slli a4, a4, 24 ; LMULMAX1-RV32-NEXT: or a1, a4, a1 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -1619,7 +1635,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: sw a1, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -1631,7 +1647,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -1643,7 +1659,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: sw a1, 36(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 ; LMULMAX1-RV32-NEXT: and a4, a4, a2 @@ -1654,7 +1670,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -1666,7 +1682,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -1678,7 +1694,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -1690,20 +1706,21 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v8i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: 
.cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a6) @@ -1721,7 +1738,7 @@ ; LMULMAX1-RV64-NEXT: slli a4, a4, 24 ; LMULMAX1-RV64-NEXT: or a1, a4, a1 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 -; LMULMAX1-RV64-NEXT: sw a1, 16(sp) +; LMULMAX1-RV64-NEXT: sw a1, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 @@ -1734,7 +1751,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 28(sp) +; LMULMAX1-RV64-NEXT: sw a1, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 @@ -1746,7 +1763,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 24(sp) +; LMULMAX1-RV64-NEXT: sw a1, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 @@ -1758,7 +1775,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 20(sp) +; LMULMAX1-RV64-NEXT: sw a1, 36(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, a2 @@ -1769,7 +1786,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 @@ -1781,7 +1798,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 @@ -1793,7 +1810,7 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 @@ -1805,14 +1822,15 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 24 ; LMULMAX1-RV64-NEXT: or a1, a1, a3 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y @@ -1825,13 +1843,13 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-LABEL: bswap_v4i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi 
sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu @@ -1849,7 +1867,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 4(sp) +; LMULMAX2-RV32-NEXT: sw a3, 36(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 @@ -1862,7 +1880,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 28(sp) +; LMULMAX2-RV32-NEXT: sw a3, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v30, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v30 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1874,7 +1892,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 20(sp) +; LMULMAX2-RV32-NEXT: sw a3, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v8, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v8 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 @@ -1886,7 +1904,7 @@ ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 -; LMULMAX2-RV32-NEXT: sw a3, 12(sp) +; LMULMAX2-RV32-NEXT: sw a3, 44(sp) ; LMULMAX2-RV32-NEXT: addi a3, zero, 32 ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26 @@ -1899,7 +1917,7 @@ ; LMULMAX2-RV32-NEXT: slli a4, a4, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: or a2, a4, a2 -; LMULMAX2-RV32-NEXT: sw a2, 0(sp) +; LMULMAX2-RV32-NEXT: sw a2, 32(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v28, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV32-NEXT: srli a4, a2, 8 @@ -1911,7 +1929,7 @@ ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a5 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: sw a2, 24(sp) +; LMULMAX2-RV32-NEXT: sw a2, 56(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v30, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV32-NEXT: srli a4, a2, 8 @@ -1923,7 +1941,7 @@ ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a5 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: sw a2, 16(sp) +; LMULMAX2-RV32-NEXT: sw a2, 48(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v8, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV32-NEXT: srli a3, a2, 8 @@ -1935,26 +1953,27 @@ ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a3 ; LMULMAX2-RV32-NEXT: or a1, a2, a1 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; 
LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v4i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu @@ -1989,7 +2008,7 @@ ; LMULMAX2-RV64-NEXT: or a2, a2, a4 ; LMULMAX2-RV64-NEXT: or a2, a2, a3 ; LMULMAX2-RV64-NEXT: or a1, a2, a1 -; LMULMAX2-RV64-NEXT: sd a1, 0(sp) +; LMULMAX2-RV64-NEXT: sd a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 @@ -2014,7 +2033,7 @@ ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a3 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sd a1, 24(sp) +; LMULMAX2-RV64-NEXT: sd a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srli a2, a1, 40 @@ -2038,7 +2057,7 @@ ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a3 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sd a1, 16(sp) +; LMULMAX2-RV64-NEXT: sd a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srli a2, a1, 40 @@ -2062,20 +2081,21 @@ ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a3 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: sd a1, 40(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle64.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) @@ -2093,7 +2113,7 @@ ; LMULMAX1-RV32-NEXT: slli a4, a4, 24 ; LMULMAX1-RV32-NEXT: or a1, a4, a1 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: sw a1, 36(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -2106,7 +2126,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; 
LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: sw a1, 44(sp) ; LMULMAX1-RV32-NEXT: addi a7, zero, 32 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 @@ -2119,7 +2139,7 @@ ; LMULMAX1-RV32-NEXT: slli a4, a4, 24 ; LMULMAX1-RV32-NEXT: or a4, a4, a5 ; LMULMAX1-RV32-NEXT: or a1, a4, a1 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v27, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -2131,7 +2151,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 ; LMULMAX1-RV32-NEXT: and a4, a4, a2 @@ -2142,7 +2162,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -2154,7 +2174,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -2166,7 +2186,7 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a5 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a4, a1, 8 @@ -2178,15 +2198,16 @@ ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -283,56 +283,58 @@ define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { ; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: ; LMULMAX8: # %bb.0: -; LMULMAX8-NEXT: addi sp, sp, -256 -; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: addi sp, sp, -384 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX8-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX8-NEXT: .cfi_offset ra, -8 ; LMULMAX8-NEXT: .cfi_offset s0, -16 -; 
LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: addi s0, sp, 384 ; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX8-NEXT: andi sp, sp, -128 ; LMULMAX8-NEXT: addi a2, zero, 32 ; LMULMAX8-NEXT: vsetvli zero, a2, e32,m8,ta,mu ; LMULMAX8-NEXT: vle32.v v24, (a0) -; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: addi a0, sp, 128 ; LMULMAX8-NEXT: addi a2, zero, 42 -; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: addi a3, sp, 128 +; LMULMAX8-NEXT: vse32.v v8, (a3) ; LMULMAX8-NEXT: vmv8r.v v8, v24 ; LMULMAX8-NEXT: call ext3@plt -; LMULMAX8-NEXT: addi sp, s0, -256 -; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: addi sp, s0, -384 +; LMULMAX8-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 384 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi sp, sp, -256 -; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: addi sp, sp, -384 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX4-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX4-NEXT: .cfi_offset ra, -8 ; LMULMAX4-NEXT: .cfi_offset s0, -16 -; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: addi s0, sp, 384 ; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX4-NEXT: andi sp, sp, -128 ; LMULMAX4-NEXT: vsetivli zero, 16, e32,m4,ta,mu ; LMULMAX4-NEXT: vle32.v v28, (a0) ; LMULMAX4-NEXT: addi a0, a0, 64 ; LMULMAX4-NEXT: vle32.v v24, (a0) -; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: addi a0, sp, 192 ; LMULMAX4-NEXT: vse32.v v12, (a0) -; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: addi a0, sp, 128 ; LMULMAX4-NEXT: addi a3, zero, 42 -; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: addi a1, sp, 128 +; LMULMAX4-NEXT: vse32.v v8, (a1) ; LMULMAX4-NEXT: vmv4r.v v8, v28 ; LMULMAX4-NEXT: vmv4r.v v12, v24 ; LMULMAX4-NEXT: call ext3@plt -; LMULMAX4-NEXT: addi sp, s0, -256 -; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: addi sp, s0, -384 +; LMULMAX4-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 384 ; LMULMAX4-NEXT: ret %t = call fastcc <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42) ret <32 x i32> %t @@ -367,13 +369,13 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) { ; LMULMAX8-LABEL: pass_vector_arg_indirect_stack: ; LMULMAX8: # %bb.0: -; LMULMAX8-NEXT: addi sp, sp, -256 -; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: addi sp, sp, -384 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX8-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX8-NEXT: .cfi_offset ra, -8 ; LMULMAX8-NEXT: .cfi_offset s0, -16 -; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: addi s0, sp, 384 ; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX8-NEXT: andi sp, sp, -128 ; LMULMAX8-NEXT: addi a0, zero, 32 @@ -386,30 +388,31 @@ ; LMULMAX8-NEXT: addi a5, zero, 5 ; LMULMAX8-NEXT: addi a6, zero, 6 ; 
LMULMAX8-NEXT: addi a7, zero, 7 -; LMULMAX8-NEXT: mv t2, sp +; LMULMAX8-NEXT: addi t2, sp, 128 ; LMULMAX8-NEXT: addi t3, zero, 8 -; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: addi a0, sp, 128 +; LMULMAX8-NEXT: vse32.v v8, (a0) ; LMULMAX8-NEXT: mv a0, zero ; LMULMAX8-NEXT: vmv8r.v v16, v8 ; LMULMAX8-NEXT: call vector_arg_indirect_stack@plt -; LMULMAX8-NEXT: addi sp, s0, -256 -; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: addi sp, s0, -384 +; LMULMAX8-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 384 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: pass_vector_arg_indirect_stack: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi sp, sp, -256 -; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: addi sp, sp, -384 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX4-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX4-NEXT: .cfi_offset ra, -8 ; LMULMAX4-NEXT: .cfi_offset s0, -16 -; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: addi s0, sp, 384 ; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX4-NEXT: andi sp, sp, -128 -; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: addi a0, sp, 192 ; LMULMAX4-NEXT: vsetivli zero, 16, e32,m4,ta,mu ; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vse32.v v8, (a0) @@ -420,18 +423,19 @@ ; LMULMAX4-NEXT: addi a5, zero, 5 ; LMULMAX4-NEXT: addi a6, zero, 6 ; LMULMAX4-NEXT: addi a7, zero, 7 -; LMULMAX4-NEXT: mv t2, sp +; LMULMAX4-NEXT: addi t2, sp, 128 ; LMULMAX4-NEXT: addi t4, zero, 8 -; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: addi a0, sp, 128 +; LMULMAX4-NEXT: vse32.v v8, (a0) ; LMULMAX4-NEXT: mv a0, zero ; LMULMAX4-NEXT: vmv4r.v v12, v8 ; LMULMAX4-NEXT: vmv4r.v v16, v8 ; LMULMAX4-NEXT: vmv4r.v v20, v8 ; LMULMAX4-NEXT: call vector_arg_indirect_stack@plt -; LMULMAX4-NEXT: addi sp, s0, -256 -; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: addi sp, s0, -384 +; LMULMAX4-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 384 ; LMULMAX4-NEXT: ret %s = call fastcc <32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8) ret <32 x i32> %s @@ -441,25 +445,31 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %last) { ; LMULMAX8-LABEL: vector_arg_direct_stack: ; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -16 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX8-NEXT: addi a0, zero, 32 ; LMULMAX8-NEXT: vsetvli zero, a0, e32,m8,ta,mu -; LMULMAX8-NEXT: addi a0, sp, 8 +; LMULMAX8-NEXT: addi a0, sp, 24 ; LMULMAX8-NEXT: vle32.v v24, (a0) ; LMULMAX8-NEXT: vadd.vv v8, v8, v16 ; LMULMAX8-NEXT: vadd.vv v8, v8, v24 +; LMULMAX8-NEXT: addi sp, sp, 16 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: vector_arg_direct_stack: ; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -16 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX4-NEXT: 
vsetivli zero, 16, e32,m4,ta,mu -; LMULMAX4-NEXT: addi a0, sp, 8 +; LMULMAX4-NEXT: addi a0, sp, 24 ; LMULMAX4-NEXT: vle32.v v28, (a0) -; LMULMAX4-NEXT: addi a0, sp, 72 +; LMULMAX4-NEXT: addi a0, sp, 88 ; LMULMAX4-NEXT: vle32.v v24, (a0) ; LMULMAX4-NEXT: vadd.vv v12, v12, v20 ; LMULMAX4-NEXT: vadd.vv v8, v8, v16 ; LMULMAX4-NEXT: vadd.vv v8, v8, v28 ; LMULMAX4-NEXT: vadd.vv v12, v12, v24 +; LMULMAX4-NEXT: addi sp, sp, 16 ; LMULMAX4-NEXT: ret %s = add <32 x i32> %x, %y %t = add <32 x i32> %s, %z @@ -547,10 +557,13 @@ define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) { ; CHECK-LABEL: vector_mask_arg_direct_stack: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vsetivli zero, 4, e8,mf4,ta,mu -; CHECK-NEXT: addi a0, sp, 136 +; CHECK-NEXT: addi a0, sp, 152 ; CHECK-NEXT: vle1.v v25, (a0) ; CHECK-NEXT: vmxor.mm v0, v0, v25 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %r = xor <4 x i1> %m1, %m2 ret <4 x i1> %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -782,67 +782,69 @@ define <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { ; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: ; LMULMAX8: # %bb.0: -; LMULMAX8-NEXT: addi sp, sp, -256 -; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: addi sp, sp, -384 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX8-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX8-NEXT: .cfi_offset ra, -8 ; LMULMAX8-NEXT: .cfi_offset s0, -16 -; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: addi s0, sp, 384 ; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX8-NEXT: andi sp, sp, -128 ; LMULMAX8-NEXT: addi a2, zero, 32 ; LMULMAX8-NEXT: vsetvli zero, a2, e32,m8,ta,mu ; LMULMAX8-NEXT: vle32.v v24, (a0) -; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: addi a0, sp, 128 ; LMULMAX8-NEXT: addi a2, zero, 42 -; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: addi a3, sp, 128 +; LMULMAX8-NEXT: vse32.v v8, (a3) ; LMULMAX8-NEXT: vmv8r.v v8, v24 ; LMULMAX8-NEXT: call ext3@plt -; LMULMAX8-NEXT: addi sp, s0, -256 -; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: addi sp, s0, -384 +; LMULMAX8-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 384 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi sp, sp, -256 -; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: addi sp, sp, -384 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX4-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX4-NEXT: .cfi_offset ra, -8 ; LMULMAX4-NEXT: .cfi_offset s0, -16 -; 
LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: addi s0, sp, 384 ; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX4-NEXT: andi sp, sp, -128 ; LMULMAX4-NEXT: vsetivli zero, 16, e32,m4,ta,mu ; LMULMAX4-NEXT: vle32.v v28, (a0) ; LMULMAX4-NEXT: addi a0, a0, 64 ; LMULMAX4-NEXT: vle32.v v24, (a0) -; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: addi a0, sp, 192 ; LMULMAX4-NEXT: vse32.v v12, (a0) -; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: addi a0, sp, 128 ; LMULMAX4-NEXT: addi a3, zero, 42 -; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: addi a1, sp, 128 +; LMULMAX4-NEXT: vse32.v v8, (a1) ; LMULMAX4-NEXT: vmv4r.v v8, v28 ; LMULMAX4-NEXT: vmv4r.v v12, v24 ; LMULMAX4-NEXT: call ext3@plt -; LMULMAX4-NEXT: addi sp, s0, -256 -; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: addi sp, s0, -384 +; LMULMAX4-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 384 ; LMULMAX4-NEXT: ret ; ; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: ; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: addi sp, sp, -256 -; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: addi sp, sp, -384 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX2-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX2-NEXT: .cfi_offset ra, -8 ; LMULMAX2-NEXT: .cfi_offset s0, -16 -; LMULMAX2-NEXT: addi s0, sp, 256 +; LMULMAX2-NEXT: addi s0, sp, 384 ; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-NEXT: andi sp, sp, -128 ; LMULMAX2-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -853,24 +855,25 @@ ; LMULMAX2-NEXT: vle32.v v30, (a1) ; LMULMAX2-NEXT: addi a0, a0, 96 ; LMULMAX2-NEXT: vle32.v v24, (a0) -; LMULMAX2-NEXT: addi a0, sp, 96 +; LMULMAX2-NEXT: addi a0, sp, 224 ; LMULMAX2-NEXT: vse32.v v14, (a0) -; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: addi a0, sp, 192 ; LMULMAX2-NEXT: vse32.v v12, (a0) -; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: addi a0, sp, 160 ; LMULMAX2-NEXT: vse32.v v10, (a0) -; LMULMAX2-NEXT: mv a0, sp +; LMULMAX2-NEXT: addi a0, sp, 128 ; LMULMAX2-NEXT: addi a5, zero, 42 -; LMULMAX2-NEXT: vse32.v v8, (sp) +; LMULMAX2-NEXT: addi a1, sp, 128 +; LMULMAX2-NEXT: vse32.v v8, (a1) ; LMULMAX2-NEXT: vmv2r.v v8, v26 ; LMULMAX2-NEXT: vmv2r.v v10, v28 ; LMULMAX2-NEXT: vmv2r.v v12, v30 ; LMULMAX2-NEXT: vmv2r.v v14, v24 ; LMULMAX2-NEXT: call ext3@plt -; LMULMAX2-NEXT: addi sp, s0, -256 -; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX2-NEXT: addi sp, sp, 256 +; LMULMAX2-NEXT: addi sp, s0, -384 +; LMULMAX2-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 384 ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: @@ -1013,13 +1016,13 @@ define <32 x i32> @call_split_vector_args(<2 x i32>* %pa, <32 x i32>* %pb) { ; LMULMAX8-LABEL: call_split_vector_args: ; LMULMAX8: # %bb.0: -; LMULMAX8-NEXT: addi sp, sp, -256 -; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: addi sp, sp, -384 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX8-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; 
LMULMAX8-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX8-NEXT: .cfi_offset ra, -8 ; LMULMAX8-NEXT: .cfi_offset s0, -16 -; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: addi s0, sp, 384 ; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX8-NEXT: andi sp, sp, -128 ; LMULMAX8-NEXT: vsetivli zero, 2, e32,mf2,ta,mu @@ -1027,28 +1030,29 @@ ; LMULMAX8-NEXT: addi a0, zero, 32 ; LMULMAX8-NEXT: vsetvli zero, a0, e32,m8,ta,mu ; LMULMAX8-NEXT: vle32.v v16, (a1) -; LMULMAX8-NEXT: mv a0, sp -; LMULMAX8-NEXT: vse32.v v16, (sp) +; LMULMAX8-NEXT: addi a0, sp, 128 +; LMULMAX8-NEXT: addi a1, sp, 128 +; LMULMAX8-NEXT: vse32.v v16, (a1) ; LMULMAX8-NEXT: vmv1r.v v9, v8 ; LMULMAX8-NEXT: vmv1r.v v10, v8 ; LMULMAX8-NEXT: vmv1r.v v11, v8 ; LMULMAX8-NEXT: vmv1r.v v12, v8 ; LMULMAX8-NEXT: call split_vector_args@plt -; LMULMAX8-NEXT: addi sp, s0, -256 -; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: addi sp, s0, -384 +; LMULMAX8-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 384 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: call_split_vector_args: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi sp, sp, -256 -; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 -; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: addi sp, sp, -384 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX4-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; LMULMAX4-NEXT: .cfi_offset ra, -8 ; LMULMAX4-NEXT: .cfi_offset s0, -16 -; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: addi s0, sp, 384 ; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX4-NEXT: andi sp, sp, -128 ; LMULMAX4-NEXT: vsetivli zero, 2, e32,mf2,ta,mu @@ -1057,30 +1061,31 @@ ; LMULMAX4-NEXT: vle32.v v16, (a1) ; LMULMAX4-NEXT: addi a0, a1, 64 ; LMULMAX4-NEXT: vle32.v v20, (a0) -; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: addi a0, sp, 192 ; LMULMAX4-NEXT: vse32.v v20, (a0) -; LMULMAX4-NEXT: mv a0, sp -; LMULMAX4-NEXT: vse32.v v16, (sp) +; LMULMAX4-NEXT: addi a0, sp, 128 +; LMULMAX4-NEXT: addi a1, sp, 128 +; LMULMAX4-NEXT: vse32.v v16, (a1) ; LMULMAX4-NEXT: vmv1r.v v9, v8 ; LMULMAX4-NEXT: vmv1r.v v10, v8 ; LMULMAX4-NEXT: vmv1r.v v11, v8 ; LMULMAX4-NEXT: vmv1r.v v12, v8 ; LMULMAX4-NEXT: call split_vector_args@plt -; LMULMAX4-NEXT: addi sp, s0, -256 -; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: addi sp, s0, -384 +; LMULMAX4-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 384 ; LMULMAX4-NEXT: ret ; ; LMULMAX2-LABEL: call_split_vector_args: ; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: addi sp, sp, -128 -; LMULMAX2-NEXT: .cfi_def_cfa_offset 128 -; LMULMAX2-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; LMULMAX2-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: addi sp, sp, -256 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill ; LMULMAX2-NEXT: .cfi_offset ra, -8 ; LMULMAX2-NEXT: .cfi_offset s0, -16 -; LMULMAX2-NEXT: addi s0, sp, 128 +; LMULMAX2-NEXT: addi s0, sp, 256 ; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-NEXT: andi sp, sp, -128 ; LMULMAX2-NEXT: vsetivli zero, 2, 
e32,mf2,ta,mu @@ -1093,33 +1098,34 @@ ; LMULMAX2-NEXT: vle32.v v18, (a0) ; LMULMAX2-NEXT: addi a0, a1, 96 ; LMULMAX2-NEXT: vle32.v v20, (a0) -; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: addi a0, sp, 192 ; LMULMAX2-NEXT: vse32.v v20, (a0) -; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: addi a0, sp, 160 ; LMULMAX2-NEXT: vse32.v v18, (a0) -; LMULMAX2-NEXT: mv a0, sp -; LMULMAX2-NEXT: vse32.v v16, (sp) +; LMULMAX2-NEXT: addi a0, sp, 128 +; LMULMAX2-NEXT: addi a1, sp, 128 +; LMULMAX2-NEXT: vse32.v v16, (a1) ; LMULMAX2-NEXT: vmv1r.v v9, v8 ; LMULMAX2-NEXT: vmv1r.v v10, v8 ; LMULMAX2-NEXT: vmv1r.v v11, v8 ; LMULMAX2-NEXT: vmv1r.v v12, v8 ; LMULMAX2-NEXT: vmv2r.v v22, v14 ; LMULMAX2-NEXT: call split_vector_args@plt -; LMULMAX2-NEXT: addi sp, s0, -128 -; LMULMAX2-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; LMULMAX2-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; LMULMAX2-NEXT: addi sp, sp, 128 +; LMULMAX2-NEXT: addi sp, s0, -256 +; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 256 ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: call_split_vector_args: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -128 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 128 -; LMULMAX1-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; LMULMAX1-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: addi sp, sp, -256 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX1-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: sd s0, 240(sp) # 8-byte Folded Spill ; LMULMAX1-NEXT: .cfi_offset ra, -8 ; LMULMAX1-NEXT: .cfi_offset s0, -16 -; LMULMAX1-NEXT: addi s0, sp, 128 +; LMULMAX1-NEXT: addi s0, sp, 256 ; LMULMAX1-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX1-NEXT: andi sp, sp, -128 ; LMULMAX1-NEXT: vsetivli zero, 2, e32,mf2,ta,mu @@ -1140,16 +1146,17 @@ ; LMULMAX1-NEXT: vle32.v v19, (a0) ; LMULMAX1-NEXT: addi a0, a1, 112 ; LMULMAX1-NEXT: vle32.v v20, (a0) -; LMULMAX1-NEXT: addi a0, sp, 64 +; LMULMAX1-NEXT: addi a0, sp, 192 ; LMULMAX1-NEXT: vse32.v v20, (a0) -; LMULMAX1-NEXT: addi a0, sp, 48 +; LMULMAX1-NEXT: addi a0, sp, 176 ; LMULMAX1-NEXT: vse32.v v19, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 160 ; LMULMAX1-NEXT: vse32.v v18, (a0) -; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: addi a0, sp, 144 ; LMULMAX1-NEXT: vse32.v v17, (a0) -; LMULMAX1-NEXT: mv a0, sp -; LMULMAX1-NEXT: vse32.v v16, (sp) +; LMULMAX1-NEXT: addi a0, sp, 128 +; LMULMAX1-NEXT: addi a1, sp, 128 +; LMULMAX1-NEXT: vse32.v v16, (a1) ; LMULMAX1-NEXT: vmv1r.v v9, v8 ; LMULMAX1-NEXT: vmv1r.v v10, v8 ; LMULMAX1-NEXT: vmv1r.v v11, v8 @@ -1158,10 +1165,10 @@ ; LMULMAX1-NEXT: vmv1r.v v22, v14 ; LMULMAX1-NEXT: vmv1r.v v23, v15 ; LMULMAX1-NEXT: call split_vector_args@plt -; LMULMAX1-NEXT: addi sp, s0, -128 -; LMULMAX1-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; LMULMAX1-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; LMULMAX1-NEXT: addi sp, sp, 128 +; LMULMAX1-NEXT: addi sp, s0, -256 +; LMULMAX1-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 256 ; LMULMAX1-NEXT: ret %a = load <2 x i32>, <2 x i32>* %pa %b = load <32 x i32>, <32 x i32>* %pb @@ -1174,55 +1181,70 @@ define <32 x i32> @vector_arg_via_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) { ; LMULMAX8-LABEL: vector_arg_via_stack: ; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -16 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 16 ; 
LMULMAX8-NEXT: addi a0, zero, 32 ; LMULMAX8-NEXT: vsetvli zero, a0, e32,m8,ta,mu -; LMULMAX8-NEXT: vle32.v v16, (sp) +; LMULMAX8-NEXT: addi a0, sp, 16 +; LMULMAX8-NEXT: vle32.v v16, (a0) ; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: addi sp, sp, 16 ; LMULMAX8-NEXT: ret ; ; LMULMAX4-LABEL: vector_arg_via_stack: ; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -16 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX4-NEXT: vsetivli zero, 16, e32,m4,ta,mu -; LMULMAX4-NEXT: vle32.v v28, (sp) -; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: addi a0, sp, 16 +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: addi a0, sp, 80 ; LMULMAX4-NEXT: vle32.v v16, (a0) ; LMULMAX4-NEXT: vadd.vv v8, v8, v28 ; LMULMAX4-NEXT: vadd.vv v12, v12, v16 +; LMULMAX4-NEXT: addi sp, sp, 16 ; LMULMAX4-NEXT: ret ; ; LMULMAX2-LABEL: vector_arg_via_stack: ; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -16 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX2-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-NEXT: vle32.v v26, (sp) -; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: addi a0, sp, 16 +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: addi a0, sp, 48 ; LMULMAX2-NEXT: vle32.v v28, (a0) -; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: addi a0, sp, 80 ; LMULMAX2-NEXT: vle32.v v30, (a0) -; LMULMAX2-NEXT: addi a0, sp, 96 +; LMULMAX2-NEXT: addi a0, sp, 112 ; LMULMAX2-NEXT: vle32.v v16, (a0) ; LMULMAX2-NEXT: vadd.vv v8, v8, v26 ; LMULMAX2-NEXT: vadd.vv v10, v10, v28 ; LMULMAX2-NEXT: vadd.vv v12, v12, v30 ; LMULMAX2-NEXT: vadd.vv v14, v14, v16 +; LMULMAX2-NEXT: addi sp, sp, 16 ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: vector_arg_via_stack: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -16 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 112 +; LMULMAX1-NEXT: addi a0, sp, 128 ; LMULMAX1-NEXT: vle32.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 96 +; LMULMAX1-NEXT: addi a0, sp, 112 ; LMULMAX1-NEXT: vle32.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 80 +; LMULMAX1-NEXT: addi a0, sp, 96 ; LMULMAX1-NEXT: vle32.v v27, (a0) -; LMULMAX1-NEXT: addi a0, sp, 64 +; LMULMAX1-NEXT: addi a0, sp, 80 ; LMULMAX1-NEXT: vle32.v v28, (a0) -; LMULMAX1-NEXT: vle32.v v29, (sp) ; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle32.v v30, (a0) +; LMULMAX1-NEXT: vle32.v v29, (a0) ; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vle32.v v31, (a0) +; LMULMAX1-NEXT: vle32.v v30, (a0) ; LMULMAX1-NEXT: addi a0, sp, 48 +; LMULMAX1-NEXT: vle32.v v31, (a0) +; LMULMAX1-NEXT: addi a0, sp, 64 ; LMULMAX1-NEXT: vle32.v v16, (a0) ; LMULMAX1-NEXT: vadd.vv v8, v8, v29 ; LMULMAX1-NEXT: vadd.vv v9, v9, v30 @@ -1232,6 +1254,7 @@ ; LMULMAX1-NEXT: vadd.vv v13, v13, v27 ; LMULMAX1-NEXT: vadd.vv v14, v14, v26 ; LMULMAX1-NEXT: vadd.vv v15, v15, v25 +; LMULMAX1-NEXT: addi sp, sp, 16 ; LMULMAX1-NEXT: ret %s = add <32 x i32> %x, %z ret <32 x i32> %s @@ -1392,9 +1415,12 @@ define <4 x i1> @vector_mask_arg_via_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8, <4 x i1> %9, <4 x i1> %10) { ; CHECK-LABEL: vector_mask_arg_via_stack: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vsetivli zero, 4, e8,mf4,ta,mu -; CHECK-NEXT: addi a0, sp, 136 +; CHECK-NEXT: addi a0, sp, 152 ; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ret <4 x i1> %10 } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -7,8 +7,8 @@ define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v16i8: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle8.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -45,7 +45,7 @@ ; LMULMAX2-RV32-NEXT: mul a5, a5, a4 ; LMULMAX2-RV32-NEXT: srli a5, a5, 24 ; LMULMAX2-RV32-NEXT: addi a5, a5, -24 -; LMULMAX2-RV32-NEXT: sb a5, 0(sp) +; LMULMAX2-RV32-NEXT: sb a5, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 @@ -74,7 +74,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) +; LMULMAX2-RV32-NEXT: sb a1, 31(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -102,7 +102,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) +; LMULMAX2-RV32-NEXT: sb a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -130,7 +130,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) +; LMULMAX2-RV32-NEXT: sb a1, 29(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -158,7 +158,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) +; LMULMAX2-RV32-NEXT: sb a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -186,7 +186,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) +; LMULMAX2-RV32-NEXT: sb a1, 27(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -214,7 +214,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) +; LMULMAX2-RV32-NEXT: sb a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -242,7 +242,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) +; LMULMAX2-RV32-NEXT: sb a1, 25(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -270,7 +270,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) +; LMULMAX2-RV32-NEXT: sb a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV32-NEXT: 
vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -298,7 +298,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) +; LMULMAX2-RV32-NEXT: sb a1, 23(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -326,7 +326,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) +; LMULMAX2-RV32-NEXT: sb a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -354,7 +354,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: sb a1, 21(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -382,7 +382,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) +; LMULMAX2-RV32-NEXT: sb a1, 20(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -410,7 +410,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) +; LMULMAX2-RV32-NEXT: sb a1, 19(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -438,7 +438,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: sb a1, 18(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -466,17 +466,18 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) +; LMULMAX2-RV32-NEXT: sb a1, 17(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle8.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse8.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v16i8: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle8.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 @@ -537,7 +538,7 @@ ; LMULMAX2-RV64-NEXT: mul a5, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a5, 56 ; LMULMAX2-RV64-NEXT: addi a5, a5, -56 -; LMULMAX2-RV64-NEXT: sb a5, 0(sp) +; LMULMAX2-RV64-NEXT: sb a5, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a5, v26 @@ -568,7 +569,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) +; LMULMAX2-RV64-NEXT: sb a1, 31(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 14 ; 
LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -598,7 +599,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) +; LMULMAX2-RV64-NEXT: sb a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -628,7 +629,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) +; LMULMAX2-RV64-NEXT: sb a1, 29(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -658,7 +659,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) +; LMULMAX2-RV64-NEXT: sb a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -688,7 +689,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) +; LMULMAX2-RV64-NEXT: sb a1, 27(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -718,7 +719,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) +; LMULMAX2-RV64-NEXT: sb a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -748,7 +749,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) +; LMULMAX2-RV64-NEXT: sb a1, 25(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -778,7 +779,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) +; LMULMAX2-RV64-NEXT: sb a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -808,7 +809,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) +; LMULMAX2-RV64-NEXT: sb a1, 23(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -838,7 +839,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) +; LMULMAX2-RV64-NEXT: sb a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -868,7 +869,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: sb a1, 21(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -898,7 +899,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; 
LMULMAX2-RV64-NEXT: sb a1, 4(sp) +; LMULMAX2-RV64-NEXT: sb a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -928,7 +929,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) +; LMULMAX2-RV64-NEXT: sb a1, 19(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -958,7 +959,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: sb a1, 18(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -988,17 +989,18 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) +; LMULMAX2-RV64-NEXT: sb a1, 17(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle8.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse8.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v16i8: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -1035,7 +1037,7 @@ ; LMULMAX1-RV32-NEXT: mul a5, a5, a4 ; LMULMAX1-RV32-NEXT: srli a5, a5, 24 ; LMULMAX1-RV32-NEXT: addi a5, a5, -24 -; LMULMAX1-RV32-NEXT: sb a5, 0(sp) +; LMULMAX1-RV32-NEXT: sb a5, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26 @@ -1064,7 +1066,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 15(sp) +; LMULMAX1-RV32-NEXT: sb a1, 31(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1092,7 +1094,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 14(sp) +; LMULMAX1-RV32-NEXT: sb a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1120,7 +1122,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 13(sp) +; LMULMAX1-RV32-NEXT: sb a1, 29(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1148,7 +1150,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 12(sp) +; LMULMAX1-RV32-NEXT: sb a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1176,7 +1178,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; 
LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 11(sp) +; LMULMAX1-RV32-NEXT: sb a1, 27(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1204,7 +1206,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 10(sp) +; LMULMAX1-RV32-NEXT: sb a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1232,7 +1234,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 9(sp) +; LMULMAX1-RV32-NEXT: sb a1, 25(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1260,7 +1262,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 8(sp) +; LMULMAX1-RV32-NEXT: sb a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1288,7 +1290,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 7(sp) +; LMULMAX1-RV32-NEXT: sb a1, 23(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1316,7 +1318,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 6(sp) +; LMULMAX1-RV32-NEXT: sb a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1344,7 +1346,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 5(sp) +; LMULMAX1-RV32-NEXT: sb a1, 21(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1372,7 +1374,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 4(sp) +; LMULMAX1-RV32-NEXT: sb a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1400,7 +1402,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 3(sp) +; LMULMAX1-RV32-NEXT: sb a1, 19(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1428,7 +1430,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 2(sp) +; LMULMAX1-RV32-NEXT: sb a1, 18(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -1456,17 +1458,18 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 1(sp) +; LMULMAX1-RV32-NEXT: sb a1, 17(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle8.v v25, (sp) +; 
LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v16i8: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 @@ -1527,7 +1530,7 @@ ; LMULMAX1-RV64-NEXT: mul a5, a5, a4 ; LMULMAX1-RV64-NEXT: srli a5, a5, 56 ; LMULMAX1-RV64-NEXT: addi a5, a5, -56 -; LMULMAX1-RV64-NEXT: sb a5, 0(sp) +; LMULMAX1-RV64-NEXT: sb a5, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV64-NEXT: vmv.x.s a5, v26 @@ -1558,7 +1561,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 15(sp) +; LMULMAX1-RV64-NEXT: sb a1, 31(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1588,7 +1591,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 14(sp) +; LMULMAX1-RV64-NEXT: sb a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1618,7 +1621,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 13(sp) +; LMULMAX1-RV64-NEXT: sb a1, 29(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1648,7 +1651,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 12(sp) +; LMULMAX1-RV64-NEXT: sb a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1678,7 +1681,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 11(sp) +; LMULMAX1-RV64-NEXT: sb a1, 27(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1708,7 +1711,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 10(sp) +; LMULMAX1-RV64-NEXT: sb a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1738,7 +1741,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 9(sp) +; LMULMAX1-RV64-NEXT: sb a1, 25(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1768,7 +1771,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 8(sp) +; LMULMAX1-RV64-NEXT: sb a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; 
LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1798,7 +1801,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 7(sp) +; LMULMAX1-RV64-NEXT: sb a1, 23(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1828,7 +1831,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 6(sp) +; LMULMAX1-RV64-NEXT: sb a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1858,7 +1861,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 5(sp) +; LMULMAX1-RV64-NEXT: sb a1, 21(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1888,7 +1891,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 4(sp) +; LMULMAX1-RV64-NEXT: sb a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1918,7 +1921,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 3(sp) +; LMULMAX1-RV64-NEXT: sb a1, 19(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1948,7 +1951,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 2(sp) +; LMULMAX1-RV64-NEXT: sb a1, 18(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -1978,11 +1981,12 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 1(sp) +; LMULMAX1-RV64-NEXT: sb a1, 17(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle8.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y @@ -1995,8 +1999,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v8i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 @@ -2035,7 +2039,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -2064,7 +2068,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, 
a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2092,7 +2096,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2120,7 +2124,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2148,7 +2152,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2176,7 +2180,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2204,7 +2208,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 20(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -2232,17 +2236,18 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 18(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v8i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 @@ -2305,7 +2310,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 @@ -2336,7 +2341,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2366,7 +2371,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; 
LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2396,7 +2401,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2426,7 +2431,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2456,7 +2461,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2486,7 +2491,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -2516,17 +2521,18 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 18(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v8i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 @@ -2565,7 +2571,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 0(sp) +; LMULMAX1-RV32-NEXT: sh a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -2594,7 +2600,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 14(sp) +; LMULMAX1-RV32-NEXT: sh a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2622,7 +2628,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 12(sp) +; LMULMAX1-RV32-NEXT: sh a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2650,7 +2656,7 @@ ; LMULMAX1-RV32-NEXT: 
mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 10(sp) +; LMULMAX1-RV32-NEXT: sh a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2678,7 +2684,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 8(sp) +; LMULMAX1-RV32-NEXT: sh a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2706,7 +2712,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 6(sp) +; LMULMAX1-RV32-NEXT: sh a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2734,7 +2740,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 4(sp) +; LMULMAX1-RV32-NEXT: sh a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: and a1, a1, a6 @@ -2762,17 +2768,18 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: sh a1, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v8i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 @@ -2835,7 +2842,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 0(sp) +; LMULMAX1-RV64-NEXT: sh a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 @@ -2866,7 +2873,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 14(sp) +; LMULMAX1-RV64-NEXT: sh a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -2896,7 +2903,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 12(sp) +; LMULMAX1-RV64-NEXT: sh a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -2926,7 +2933,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 10(sp) +; LMULMAX1-RV64-NEXT: sh a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -2956,7 +2963,7 @@ ; 
LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 8(sp) +; LMULMAX1-RV64-NEXT: sh a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -2986,7 +2993,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 6(sp) +; LMULMAX1-RV64-NEXT: sh a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -3016,7 +3023,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 4(sp) +; LMULMAX1-RV64-NEXT: sh a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: and a1, a1, a6 @@ -3046,11 +3053,12 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 2(sp) +; LMULMAX1-RV64-NEXT: sh a1, 18(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y @@ -3063,8 +3071,8 @@ define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v4i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -3099,7 +3107,7 @@ ; LMULMAX2-RV32-NEXT: addi a4, a4, 257 ; LMULMAX2-RV32-NEXT: mul a5, a5, a4 ; LMULMAX2-RV32-NEXT: srli a5, a5, 24 -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 @@ -3126,7 +3134,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -3152,7 +3160,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -3178,17 +3186,18 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 20(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; 
LMULMAX2-RV64-LABEL: ctlz_v4i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 @@ -3250,7 +3259,7 @@ ; LMULMAX2-RV64-NEXT: mul a5, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a5, 56 ; LMULMAX2-RV64-NEXT: addi a5, a5, -32 -; LMULMAX2-RV64-NEXT: sw a5, 0(sp) +; LMULMAX2-RV64-NEXT: sw a5, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a5, v26 @@ -3282,7 +3291,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 12(sp) +; LMULMAX2-RV64-NEXT: sw a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -3313,7 +3322,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 8(sp) +; LMULMAX2-RV64-NEXT: sw a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -3344,17 +3353,18 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: sw a1, 20(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v4i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -3389,7 +3399,7 @@ ; LMULMAX1-RV32-NEXT: addi a4, a4, 257 ; LMULMAX1-RV32-NEXT: mul a5, a5, a4 ; LMULMAX1-RV32-NEXT: srli a5, a5, 24 -; LMULMAX1-RV32-NEXT: sw a5, 0(sp) +; LMULMAX1-RV32-NEXT: sw a5, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26 @@ -3416,7 +3426,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a5, a1, 1 @@ -3442,7 +3452,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a5, a1, 1 @@ -3468,17 +3478,18 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; 
LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v4i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 @@ -3540,7 +3551,7 @@ ; LMULMAX1-RV64-NEXT: mul a5, a5, a4 ; LMULMAX1-RV64-NEXT: srli a5, a5, 56 ; LMULMAX1-RV64-NEXT: addi a5, a5, -32 -; LMULMAX1-RV64-NEXT: sw a5, 0(sp) +; LMULMAX1-RV64-NEXT: sw a5, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a5, v26 @@ -3572,7 +3583,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 1 @@ -3603,7 +3614,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a5, a1, 1 @@ -3634,11 +3645,12 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y @@ -3651,12 +3663,12 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v2i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX2-RV32-NEXT: sw zero, 12(sp) -; LMULMAX2-RV32-NEXT: sw zero, 4(sp) +; LMULMAX2-RV32-NEXT: sw zero, 28(sp) +; LMULMAX2-RV32-NEXT: sw zero, 20(sp) ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 @@ -3725,7 +3737,7 @@ ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 16(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -3779,12 +3791,13 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_6: -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: 
vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v2i64: @@ -3883,12 +3896,12 @@ ; ; LMULMAX1-RV32-LABEL: ctlz_v2i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: sw zero, 12(sp) -; LMULMAX1-RV32-NEXT: sw zero, 4(sp) +; LMULMAX1-RV32-NEXT: sw zero, 28(sp) +; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: addi a6, zero, 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 @@ -3957,7 +3970,7 @@ ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: sw a5, 0(sp) +; LMULMAX1-RV32-NEXT: sw a5, 16(sp) ; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_5 ; LMULMAX1-RV32-NEXT: # %bb.4: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -4011,12 +4024,13 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a2 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB3_6: -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v2i64: @@ -4123,13 +4137,13 @@ define void @ctlz_v32i8(<32 x i8>* %x, <32 x i8>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v32i8: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 @@ -4169,7 +4183,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) +; LMULMAX2-RV32-NEXT: sb a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -4198,7 +4212,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) +; LMULMAX2-RV32-NEXT: sb a1, 63(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4226,7 +4240,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi 
a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) +; LMULMAX2-RV32-NEXT: sb a1, 62(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4254,7 +4268,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) +; LMULMAX2-RV32-NEXT: sb a1, 61(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4282,7 +4296,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) +; LMULMAX2-RV32-NEXT: sb a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4310,7 +4324,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) +; LMULMAX2-RV32-NEXT: sb a1, 59(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4338,7 +4352,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) +; LMULMAX2-RV32-NEXT: sb a1, 58(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4366,7 +4380,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) +; LMULMAX2-RV32-NEXT: sb a1, 57(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4394,7 +4408,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) +; LMULMAX2-RV32-NEXT: sb a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4422,7 +4436,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) +; LMULMAX2-RV32-NEXT: sb a1, 55(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4450,7 +4464,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) +; LMULMAX2-RV32-NEXT: sb a1, 54(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4478,7 +4492,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) +; LMULMAX2-RV32-NEXT: sb a1, 53(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4506,7 +4520,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) +; LMULMAX2-RV32-NEXT: sb a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: 
andi a1, a1, 255 @@ -4534,7 +4548,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) +; LMULMAX2-RV32-NEXT: sb a1, 51(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4562,7 +4576,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) +; LMULMAX2-RV32-NEXT: sb a1, 50(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4590,7 +4604,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) +; LMULMAX2-RV32-NEXT: sb a1, 49(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4618,7 +4632,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) +; LMULMAX2-RV32-NEXT: sb a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4646,7 +4660,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) +; LMULMAX2-RV32-NEXT: sb a1, 47(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4674,7 +4688,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) +; LMULMAX2-RV32-NEXT: sb a1, 46(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4702,7 +4716,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) +; LMULMAX2-RV32-NEXT: sb a1, 45(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4730,7 +4744,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) +; LMULMAX2-RV32-NEXT: sb a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4758,7 +4772,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) +; LMULMAX2-RV32-NEXT: sb a1, 43(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4786,7 +4800,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) +; LMULMAX2-RV32-NEXT: sb a1, 42(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4814,7 +4828,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) +; 
LMULMAX2-RV32-NEXT: sb a1, 41(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4842,7 +4856,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) +; LMULMAX2-RV32-NEXT: sb a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4870,7 +4884,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) +; LMULMAX2-RV32-NEXT: sb a1, 39(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4898,7 +4912,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) +; LMULMAX2-RV32-NEXT: sb a1, 38(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4926,7 +4940,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: sb a1, 37(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4954,7 +4968,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) +; LMULMAX2-RV32-NEXT: sb a1, 36(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -4982,7 +4996,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) +; LMULMAX2-RV32-NEXT: sb a1, 35(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -5010,7 +5024,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: sb a1, 34(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: andi a1, a1, 255 @@ -5038,25 +5052,26 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -24 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) +; LMULMAX2-RV32-NEXT: sb a1, 33(sp) ; LMULMAX2-RV32-NEXT: vsetvli zero, a6, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle8.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse8.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v32i8: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; 
LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: addi a6, zero, 32 @@ -5120,7 +5135,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) +; LMULMAX2-RV64-NEXT: sb a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 @@ -5151,7 +5166,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) +; LMULMAX2-RV64-NEXT: sb a1, 63(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5181,7 +5196,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) +; LMULMAX2-RV64-NEXT: sb a1, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5211,7 +5226,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) +; LMULMAX2-RV64-NEXT: sb a1, 61(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5241,7 +5256,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) +; LMULMAX2-RV64-NEXT: sb a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5271,7 +5286,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) +; LMULMAX2-RV64-NEXT: sb a1, 59(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5301,7 +5316,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) +; LMULMAX2-RV64-NEXT: sb a1, 58(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5331,7 +5346,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) +; LMULMAX2-RV64-NEXT: sb a1, 57(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5361,7 +5376,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) +; LMULMAX2-RV64-NEXT: sb a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; 
LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5391,7 +5406,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) +; LMULMAX2-RV64-NEXT: sb a1, 55(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5421,7 +5436,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) +; LMULMAX2-RV64-NEXT: sb a1, 54(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5451,7 +5466,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) +; LMULMAX2-RV64-NEXT: sb a1, 53(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5481,7 +5496,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) +; LMULMAX2-RV64-NEXT: sb a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5511,7 +5526,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) +; LMULMAX2-RV64-NEXT: sb a1, 51(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5541,7 +5556,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) +; LMULMAX2-RV64-NEXT: sb a1, 50(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5571,7 +5586,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) +; LMULMAX2-RV64-NEXT: sb a1, 49(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5601,7 +5616,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) +; LMULMAX2-RV64-NEXT: sb a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5631,7 +5646,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) +; LMULMAX2-RV64-NEXT: sb a1, 47(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5661,7 +5676,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) +; LMULMAX2-RV64-NEXT: sb a1, 46(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5691,7 +5706,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; 
LMULMAX2-RV64-NEXT: sb a1, 13(sp) +; LMULMAX2-RV64-NEXT: sb a1, 45(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5721,7 +5736,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) +; LMULMAX2-RV64-NEXT: sb a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5751,7 +5766,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) +; LMULMAX2-RV64-NEXT: sb a1, 43(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5781,7 +5796,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) +; LMULMAX2-RV64-NEXT: sb a1, 42(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5811,7 +5826,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) +; LMULMAX2-RV64-NEXT: sb a1, 41(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5841,7 +5856,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) +; LMULMAX2-RV64-NEXT: sb a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5871,7 +5886,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) +; LMULMAX2-RV64-NEXT: sb a1, 39(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5901,7 +5916,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) +; LMULMAX2-RV64-NEXT: sb a1, 38(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5931,7 +5946,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: sb a1, 37(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5961,7 +5976,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) +; LMULMAX2-RV64-NEXT: sb a1, 36(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -5991,7 +6006,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) +; LMULMAX2-RV64-NEXT: sb a1, 35(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -6021,7 
+6036,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: sb a1, 34(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: andi a1, a1, 255 @@ -6051,20 +6066,21 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -56 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) +; LMULMAX2-RV64-NEXT: sb a1, 33(sp) ; LMULMAX2-RV64-NEXT: vsetvli zero, a6, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle8.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse8.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v32i8: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle8.v v26, (a6) @@ -6103,7 +6119,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 16(sp) +; LMULMAX1-RV32-NEXT: sb a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -6132,7 +6148,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 31(sp) +; LMULMAX1-RV32-NEXT: sb a1, 47(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6160,7 +6176,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 30(sp) +; LMULMAX1-RV32-NEXT: sb a1, 46(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6188,7 +6204,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 29(sp) +; LMULMAX1-RV32-NEXT: sb a1, 45(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6216,7 +6232,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 28(sp) +; LMULMAX1-RV32-NEXT: sb a1, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6244,7 +6260,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 27(sp) +; LMULMAX1-RV32-NEXT: sb a1, 43(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, 
a1, 255 @@ -6272,7 +6288,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 26(sp) +; LMULMAX1-RV32-NEXT: sb a1, 42(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6300,7 +6316,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 25(sp) +; LMULMAX1-RV32-NEXT: sb a1, 41(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6328,7 +6344,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 24(sp) +; LMULMAX1-RV32-NEXT: sb a1, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6356,7 +6372,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 23(sp) +; LMULMAX1-RV32-NEXT: sb a1, 39(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6384,7 +6400,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 22(sp) +; LMULMAX1-RV32-NEXT: sb a1, 38(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6412,7 +6428,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 21(sp) +; LMULMAX1-RV32-NEXT: sb a1, 37(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6440,7 +6456,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 20(sp) +; LMULMAX1-RV32-NEXT: sb a1, 36(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6468,7 +6484,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 19(sp) +; LMULMAX1-RV32-NEXT: sb a1, 35(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6496,7 +6512,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 18(sp) +; LMULMAX1-RV32-NEXT: sb a1, 34(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6524,7 +6540,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 17(sp) +; LMULMAX1-RV32-NEXT: sb a1, 33(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -6551,7 +6567,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 0(sp) +; LMULMAX1-RV32-NEXT: sb a1, 
16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6579,7 +6595,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 15(sp) +; LMULMAX1-RV32-NEXT: sb a1, 31(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6607,7 +6623,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 14(sp) +; LMULMAX1-RV32-NEXT: sb a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6635,7 +6651,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 13(sp) +; LMULMAX1-RV32-NEXT: sb a1, 29(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6663,7 +6679,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 12(sp) +; LMULMAX1-RV32-NEXT: sb a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6691,7 +6707,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 11(sp) +; LMULMAX1-RV32-NEXT: sb a1, 27(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6719,7 +6735,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 10(sp) +; LMULMAX1-RV32-NEXT: sb a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6747,7 +6763,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 9(sp) +; LMULMAX1-RV32-NEXT: sb a1, 25(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6775,7 +6791,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 8(sp) +; LMULMAX1-RV32-NEXT: sb a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6803,7 +6819,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 7(sp) +; LMULMAX1-RV32-NEXT: sb a1, 23(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6831,7 +6847,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 6(sp) +; LMULMAX1-RV32-NEXT: sb a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6859,7 +6875,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; 
LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 5(sp) +; LMULMAX1-RV32-NEXT: sb a1, 21(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6887,7 +6903,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 4(sp) +; LMULMAX1-RV32-NEXT: sb a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6915,7 +6931,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 3(sp) +; LMULMAX1-RV32-NEXT: sb a1, 19(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6943,7 +6959,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 2(sp) +; LMULMAX1-RV32-NEXT: sb a1, 18(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: andi a1, a1, 255 @@ -6971,20 +6987,21 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -24 -; LMULMAX1-RV32-NEXT: sb a1, 1(sp) +; LMULMAX1-RV32-NEXT: sb a1, 17(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle8.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle8.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse8.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v32i8: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle8.v v26, (a6) @@ -7047,7 +7064,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 16(sp) +; LMULMAX1-RV64-NEXT: sb a1, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 15 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 @@ -7078,7 +7095,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 31(sp) +; LMULMAX1-RV64-NEXT: sb a1, 47(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7108,7 +7125,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 30(sp) +; LMULMAX1-RV64-NEXT: sb a1, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7138,7 +7155,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 29(sp) +; LMULMAX1-RV64-NEXT: sb a1, 45(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, 
v26, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7168,7 +7185,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 28(sp) +; LMULMAX1-RV64-NEXT: sb a1, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 11 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7198,7 +7215,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 27(sp) +; LMULMAX1-RV64-NEXT: sb a1, 43(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7228,7 +7245,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 26(sp) +; LMULMAX1-RV64-NEXT: sb a1, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7258,7 +7275,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 25(sp) +; LMULMAX1-RV64-NEXT: sb a1, 41(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7288,7 +7305,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 24(sp) +; LMULMAX1-RV64-NEXT: sb a1, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7318,7 +7335,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 23(sp) +; LMULMAX1-RV64-NEXT: sb a1, 39(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7348,7 +7365,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 22(sp) +; LMULMAX1-RV64-NEXT: sb a1, 38(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7378,7 +7395,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 21(sp) +; LMULMAX1-RV64-NEXT: sb a1, 37(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7408,7 +7425,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 20(sp) +; LMULMAX1-RV64-NEXT: sb a1, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7438,7 +7455,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 19(sp) +; LMULMAX1-RV64-NEXT: sb a1, 35(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7468,7 +7485,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; 
LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 18(sp) +; LMULMAX1-RV64-NEXT: sb a1, 34(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7498,7 +7515,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 17(sp) +; LMULMAX1-RV64-NEXT: sb a1, 33(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 @@ -7527,7 +7544,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 0(sp) +; LMULMAX1-RV64-NEXT: sb a1, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7557,7 +7574,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 15(sp) +; LMULMAX1-RV64-NEXT: sb a1, 31(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7587,7 +7604,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 14(sp) +; LMULMAX1-RV64-NEXT: sb a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7617,7 +7634,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 13(sp) +; LMULMAX1-RV64-NEXT: sb a1, 29(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7647,7 +7664,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 12(sp) +; LMULMAX1-RV64-NEXT: sb a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7677,7 +7694,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 11(sp) +; LMULMAX1-RV64-NEXT: sb a1, 27(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7707,7 +7724,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 10(sp) +; LMULMAX1-RV64-NEXT: sb a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7737,7 +7754,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 9(sp) +; LMULMAX1-RV64-NEXT: sb a1, 25(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7767,7 +7784,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 8(sp) +; LMULMAX1-RV64-NEXT: sb a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; 
LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7797,7 +7814,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 7(sp) +; LMULMAX1-RV64-NEXT: sb a1, 23(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7827,7 +7844,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 6(sp) +; LMULMAX1-RV64-NEXT: sb a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7857,7 +7874,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 5(sp) +; LMULMAX1-RV64-NEXT: sb a1, 21(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7887,7 +7904,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 4(sp) +; LMULMAX1-RV64-NEXT: sb a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7917,7 +7934,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 3(sp) +; LMULMAX1-RV64-NEXT: sb a1, 19(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7947,7 +7964,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 2(sp) +; LMULMAX1-RV64-NEXT: sb a1, 18(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: andi a1, a1, 255 @@ -7977,14 +7994,15 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -56 -; LMULMAX1-RV64-NEXT: sb a1, 1(sp) +; LMULMAX1-RV64-NEXT: sb a1, 17(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle8.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle8.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse8.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y @@ -7997,13 +8015,13 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v16i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; 
LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -8044,7 +8062,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -8073,7 +8091,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 30(sp) +; LMULMAX2-RV32-NEXT: sh a1, 62(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8101,7 +8119,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 28(sp) +; LMULMAX2-RV32-NEXT: sh a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8129,7 +8147,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 26(sp) +; LMULMAX2-RV32-NEXT: sh a1, 58(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8157,7 +8175,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 24(sp) +; LMULMAX2-RV32-NEXT: sh a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8185,7 +8203,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 22(sp) +; LMULMAX2-RV32-NEXT: sh a1, 54(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8213,7 +8231,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 20(sp) +; LMULMAX2-RV32-NEXT: sh a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8241,7 +8259,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 18(sp) +; LMULMAX2-RV32-NEXT: sh a1, 50(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8269,7 +8287,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 16(sp) +; LMULMAX2-RV32-NEXT: sh a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8297,7 +8315,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 46(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8325,7 +8343,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; 
LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8353,7 +8371,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 42(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8381,7 +8399,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8409,7 +8427,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 38(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8437,7 +8455,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 36(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: and a1, a1, a6 @@ -8465,25 +8483,26 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, -16 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 34(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v16i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -8548,7 +8567,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 @@ -8579,7 +8598,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; 
LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 30(sp) +; LMULMAX2-RV64-NEXT: sh a1, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8609,7 +8628,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 28(sp) +; LMULMAX2-RV64-NEXT: sh a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8639,7 +8658,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 26(sp) +; LMULMAX2-RV64-NEXT: sh a1, 58(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8669,7 +8688,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 24(sp) +; LMULMAX2-RV64-NEXT: sh a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8699,7 +8718,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 22(sp) +; LMULMAX2-RV64-NEXT: sh a1, 54(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8729,7 +8748,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 20(sp) +; LMULMAX2-RV64-NEXT: sh a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8759,7 +8778,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 18(sp) +; LMULMAX2-RV64-NEXT: sh a1, 50(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8789,7 +8808,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 16(sp) +; LMULMAX2-RV64-NEXT: sh a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8819,7 +8838,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 46(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8849,7 +8868,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8879,7 +8898,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 42(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: 
vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8909,7 +8928,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8939,7 +8958,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 38(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8969,7 +8988,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 36(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: and a1, a1, a6 @@ -8999,20 +9018,21 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 34(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v16i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a6) @@ -9053,7 +9073,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 16(sp) +; LMULMAX1-RV32-NEXT: sh a2, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 @@ -9082,7 +9102,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 30(sp) +; LMULMAX1-RV32-NEXT: sh a2, 46(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9110,7 +9130,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 28(sp) +; LMULMAX1-RV32-NEXT: sh a2, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9138,7 +9158,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 26(sp) +; LMULMAX1-RV32-NEXT: sh a2, 42(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 ; 
LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9166,7 +9186,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 24(sp) +; LMULMAX1-RV32-NEXT: sh a2, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9194,7 +9214,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 22(sp) +; LMULMAX1-RV32-NEXT: sh a2, 38(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9222,7 +9242,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 20(sp) +; LMULMAX1-RV32-NEXT: sh a2, 36(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9250,7 +9270,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 18(sp) +; LMULMAX1-RV32-NEXT: sh a2, 34(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 ; LMULMAX1-RV32-NEXT: srli a3, a2, 1 @@ -9277,7 +9297,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 0(sp) +; LMULMAX1-RV32-NEXT: sh a2, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9305,7 +9325,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 14(sp) +; LMULMAX1-RV32-NEXT: sh a2, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9333,7 +9353,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 12(sp) +; LMULMAX1-RV32-NEXT: sh a2, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9361,7 +9381,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 10(sp) +; LMULMAX1-RV32-NEXT: sh a2, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9389,7 +9409,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 8(sp) +; LMULMAX1-RV32-NEXT: sh a2, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9417,7 +9437,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 6(sp) +; LMULMAX1-RV32-NEXT: sh a2, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9445,7 +9465,7 @@ ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: addi a2, a2, -16 -; LMULMAX1-RV32-NEXT: sh a2, 
4(sp) +; LMULMAX1-RV32-NEXT: sh a2, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: and a2, a2, a7 @@ -9473,20 +9493,21 @@ ; LMULMAX1-RV32-NEXT: mul a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, -16 -; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: sh a1, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse16.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v16i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a6) @@ -9551,7 +9572,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: sh a2, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 @@ -9582,7 +9603,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: sh a2, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9612,7 +9633,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: sh a2, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9642,7 +9663,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: sh a2, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9672,7 +9693,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: sh a2, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9702,7 +9723,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: sh a2, 38(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9732,7 +9753,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: sh a2, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9762,7 +9783,7 @@ ; 
LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 18(sp) +; LMULMAX1-RV64-NEXT: sh a2, 34(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 @@ -9791,7 +9812,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 0(sp) +; LMULMAX1-RV64-NEXT: sh a2, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9821,7 +9842,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 14(sp) +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9851,7 +9872,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 12(sp) +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9881,7 +9902,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 10(sp) +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9911,7 +9932,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 8(sp) +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9941,7 +9962,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 6(sp) +; LMULMAX1-RV64-NEXT: sh a2, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -9971,7 +9992,7 @@ ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 ; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 4(sp) +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: and a2, a2, a7 @@ -10001,14 +10022,15 @@ ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 2(sp) +; LMULMAX1-RV64-NEXT: sh a1, 18(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse16.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y @@ -10021,13 +10043,13 @@ define void @ctlz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v8i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: 
.cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -10064,7 +10086,7 @@ ; LMULMAX2-RV32-NEXT: addi a4, a4, 257 ; LMULMAX2-RV32-NEXT: mul a5, a5, a4 ; LMULMAX2-RV32-NEXT: srli a5, a5, 24 -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28 @@ -10091,7 +10113,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32-NEXT: sw a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10117,7 +10139,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32-NEXT: sw a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10143,7 +10165,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32-NEXT: sw a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10169,7 +10191,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32-NEXT: sw a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10195,7 +10217,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10221,7 +10243,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: srli a5, a1, 1 @@ -10247,25 +10269,26 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 36(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 
4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v8i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -10329,7 +10352,7 @@ ; LMULMAX2-RV64-NEXT: mul a5, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a5, 56 ; LMULMAX2-RV64-NEXT: addi a5, a5, -32 -; LMULMAX2-RV64-NEXT: sw a5, 0(sp) +; LMULMAX2-RV64-NEXT: sw a5, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a5, v28 @@ -10361,7 +10384,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 28(sp) +; LMULMAX2-RV64-NEXT: sw a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10392,7 +10415,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 24(sp) +; LMULMAX2-RV64-NEXT: sw a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10423,7 +10446,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 20(sp) +; LMULMAX2-RV64-NEXT: sw a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10454,7 +10477,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 16(sp) +; LMULMAX2-RV64-NEXT: sw a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10485,7 +10508,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 12(sp) +; LMULMAX2-RV64-NEXT: sw a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10516,7 +10539,7 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 -; LMULMAX2-RV64-NEXT: sw a1, 8(sp) +; LMULMAX2-RV64-NEXT: sw a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srliw a5, a1, 1 @@ -10547,20 +10570,21 @@ ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -32 
-; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: sw a1, 36(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v8i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a6) @@ -10597,7 +10621,7 @@ ; LMULMAX1-RV32-NEXT: addi a5, a5, 257 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -10624,7 +10648,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: sw a1, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -10650,7 +10674,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -10676,7 +10700,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: sw a1, 36(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -10701,7 +10725,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -10727,7 +10751,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -10753,7 +10777,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -10779,20 +10803,21 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; 
LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v8i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a6) @@ -10856,7 +10881,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 16(sp) +; LMULMAX1-RV64-NEXT: sw a1, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 @@ -10888,7 +10913,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 28(sp) +; LMULMAX1-RV64-NEXT: sw a1, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 @@ -10919,7 +10944,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 24(sp) +; LMULMAX1-RV64-NEXT: sw a1, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 @@ -10950,7 +10975,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 20(sp) +; LMULMAX1-RV64-NEXT: sw a1, 36(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 ; LMULMAX1-RV64-NEXT: slli a1, a1, 32 @@ -10980,7 +11005,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 @@ -11011,7 +11036,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 @@ -11042,7 +11067,7 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srliw a2, a1, 1 @@ -11073,14 +11098,15 @@ ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -32 -; LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) ; 
LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y @@ -11093,21 +11119,21 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-LABEL: ctlz_v4i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV32-NEXT: sw zero, 28(sp) -; LMULMAX2-RV32-NEXT: sw zero, 20(sp) -; LMULMAX2-RV32-NEXT: sw zero, 12(sp) -; LMULMAX2-RV32-NEXT: sw zero, 4(sp) +; LMULMAX2-RV32-NEXT: sw zero, 60(sp) +; LMULMAX2-RV32-NEXT: sw zero, 52(sp) +; LMULMAX2-RV32-NEXT: sw zero, 44(sp) +; LMULMAX2-RV32-NEXT: sw zero, 36(sp) ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 @@ -11176,7 +11202,7 @@ ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 32(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -11233,7 +11259,7 @@ ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 -; LMULMAX2-RV32-NEXT: sw a5, 24(sp) +; LMULMAX2-RV32-NEXT: sw a5, 56(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX2-RV32-NEXT: # %bb.7: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -11290,7 +11316,7 @@ ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sw a5, 16(sp) +; LMULMAX2-RV32-NEXT: sw a5, 48(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_11 ; LMULMAX2-RV32-NEXT: # %bb.10: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -11344,26 +11370,27 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_12: -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 
92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: ctlz_v4i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu @@ -11424,7 +11451,7 @@ ; LMULMAX2-RV64-NEXT: addi a4, a4, 257 ; LMULMAX2-RV64-NEXT: mul a5, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a5, 56 -; LMULMAX2-RV64-NEXT: sd a5, 0(sp) +; LMULMAX2-RV64-NEXT: sd a5, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a5, v28 @@ -11453,7 +11480,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 24(sp) +; LMULMAX2-RV64-NEXT: sd a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: srli a5, a1, 1 @@ -11481,7 +11508,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 16(sp) +; LMULMAX2-RV64-NEXT: sd a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srli a5, a1, 1 @@ -11509,26 +11536,27 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: sd a1, 40(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle64.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) -; LMULMAX1-RV32-NEXT: sw zero, 28(sp) -; LMULMAX1-RV32-NEXT: sw zero, 20(sp) +; LMULMAX1-RV32-NEXT: sw zero, 44(sp) +; LMULMAX1-RV32-NEXT: sw zero, 36(sp) ; LMULMAX1-RV32-NEXT: addi a7, zero, 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 @@ -11597,7 +11625,7 @@ ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 ; 
LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_5 ; LMULMAX1-RV32-NEXT: # %bb.4: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -11651,11 +11679,11 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_6: -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) -; LMULMAX1-RV32-NEXT: sw zero, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) +; LMULMAX1-RV32-NEXT: sw zero, 28(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: sw zero, 4(sp) +; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -11712,7 +11740,7 @@ ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -11766,15 +11794,16 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_12: -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctlz_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -7,8 +7,8 @@ define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v16i8: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle8.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -36,7 +36,7 @@ ; LMULMAX2-RV32-NEXT: addi a4, a4, 257 ; LMULMAX2-RV32-NEXT: mul a5, a5, a4 ; LMULMAX2-RV32-NEXT: srli a5, a5, 24 -; LMULMAX2-RV32-NEXT: sb a5, 0(sp) +; LMULMAX2-RV32-NEXT: sb a5, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 @@ -56,7 +56,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) +; LMULMAX2-RV32-NEXT: sb a1, 31(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -75,7 +75,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) +; LMULMAX2-RV32-NEXT: sb a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -94,7 +94,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 
; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) +; LMULMAX2-RV32-NEXT: sb a1, 29(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -113,7 +113,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) +; LMULMAX2-RV32-NEXT: sb a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -132,7 +132,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) +; LMULMAX2-RV32-NEXT: sb a1, 27(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -151,7 +151,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) +; LMULMAX2-RV32-NEXT: sb a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -170,7 +170,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) +; LMULMAX2-RV32-NEXT: sb a1, 25(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -189,7 +189,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) +; LMULMAX2-RV32-NEXT: sb a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -208,7 +208,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) +; LMULMAX2-RV32-NEXT: sb a1, 23(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -227,7 +227,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) +; LMULMAX2-RV32-NEXT: sb a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -246,7 +246,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: sb a1, 21(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -265,7 +265,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) +; LMULMAX2-RV32-NEXT: sb a1, 20(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -284,7 +284,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) +; LMULMAX2-RV32-NEXT: sb a1, 19(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: 
ori a1, a1, 256 @@ -303,7 +303,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: sb a1, 18(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -322,17 +322,18 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) +; LMULMAX2-RV32-NEXT: sb a1, 17(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle8.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse8.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v16i8: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle8.v v25, (a0) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu @@ -384,7 +385,7 @@ ; LMULMAX2-RV64-NEXT: addi a4, a4, 257 ; LMULMAX2-RV64-NEXT: mul a5, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a5, 56 -; LMULMAX2-RV64-NEXT: sb a5, 15(sp) +; LMULMAX2-RV64-NEXT: sb a5, 31(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a5, v26 ; LMULMAX2-RV64-NEXT: ori a5, a5, 256 @@ -403,7 +404,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) +; LMULMAX2-RV64-NEXT: sb a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -422,7 +423,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) +; LMULMAX2-RV64-NEXT: sb a1, 29(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -441,7 +442,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) +; LMULMAX2-RV64-NEXT: sb a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -460,7 +461,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) +; LMULMAX2-RV64-NEXT: sb a1, 27(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -479,7 +480,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) +; LMULMAX2-RV64-NEXT: sb a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -498,7 +499,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) +; LMULMAX2-RV64-NEXT: sb a1, 25(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, 
a1, 256 @@ -517,7 +518,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) +; LMULMAX2-RV64-NEXT: sb a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -536,7 +537,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) +; LMULMAX2-RV64-NEXT: sb a1, 23(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -555,7 +556,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) +; LMULMAX2-RV64-NEXT: sb a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -574,7 +575,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: sb a1, 21(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -593,7 +594,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) +; LMULMAX2-RV64-NEXT: sb a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -612,7 +613,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) +; LMULMAX2-RV64-NEXT: sb a1, 19(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -631,7 +632,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: sb a1, 18(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -650,7 +651,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) +; LMULMAX2-RV64-NEXT: sb a1, 17(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 ; LMULMAX2-RV64-NEXT: addi a5, a1, -1 @@ -668,17 +669,18 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a4 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) +; LMULMAX2-RV64-NEXT: sb a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle8.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse8.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v16i8: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -706,7 +708,7 @@ ; 
LMULMAX1-RV32-NEXT: addi a4, a4, 257 ; LMULMAX1-RV32-NEXT: mul a5, a5, a4 ; LMULMAX1-RV32-NEXT: srli a5, a5, 24 -; LMULMAX1-RV32-NEXT: sb a5, 0(sp) +; LMULMAX1-RV32-NEXT: sb a5, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26 @@ -726,7 +728,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 15(sp) +; LMULMAX1-RV32-NEXT: sb a1, 31(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -745,7 +747,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 14(sp) +; LMULMAX1-RV32-NEXT: sb a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -764,7 +766,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 13(sp) +; LMULMAX1-RV32-NEXT: sb a1, 29(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -783,7 +785,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 12(sp) +; LMULMAX1-RV32-NEXT: sb a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -802,7 +804,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 11(sp) +; LMULMAX1-RV32-NEXT: sb a1, 27(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -821,7 +823,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 10(sp) +; LMULMAX1-RV32-NEXT: sb a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -840,7 +842,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 9(sp) +; LMULMAX1-RV32-NEXT: sb a1, 25(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -859,7 +861,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 8(sp) +; LMULMAX1-RV32-NEXT: sb a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -878,7 +880,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 7(sp) +; LMULMAX1-RV32-NEXT: sb a1, 23(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -897,7 +899,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 6(sp) +; LMULMAX1-RV32-NEXT: sb a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; 
LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -916,7 +918,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 5(sp) +; LMULMAX1-RV32-NEXT: sb a1, 21(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -935,7 +937,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 4(sp) +; LMULMAX1-RV32-NEXT: sb a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -954,7 +956,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 3(sp) +; LMULMAX1-RV32-NEXT: sb a1, 19(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -973,7 +975,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 2(sp) +; LMULMAX1-RV32-NEXT: sb a1, 18(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -992,17 +994,18 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a4 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 1(sp) +; LMULMAX1-RV32-NEXT: sb a1, 17(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle8.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v16i8: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu @@ -1054,7 +1057,7 @@ ; LMULMAX1-RV64-NEXT: addi a4, a4, 257 ; LMULMAX1-RV64-NEXT: mul a5, a5, a4 ; LMULMAX1-RV64-NEXT: srli a5, a5, 56 -; LMULMAX1-RV64-NEXT: sb a5, 15(sp) +; LMULMAX1-RV64-NEXT: sb a5, 31(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a5, v26 ; LMULMAX1-RV64-NEXT: ori a5, a5, 256 @@ -1073,7 +1076,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 14(sp) +; LMULMAX1-RV64-NEXT: sb a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1092,7 +1095,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 13(sp) +; LMULMAX1-RV64-NEXT: sb a1, 29(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1111,7 +1114,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 12(sp) +; LMULMAX1-RV64-NEXT: sb a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 11 ; 
LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1130,7 +1133,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 11(sp) +; LMULMAX1-RV64-NEXT: sb a1, 27(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1149,7 +1152,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 10(sp) +; LMULMAX1-RV64-NEXT: sb a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1168,7 +1171,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 9(sp) +; LMULMAX1-RV64-NEXT: sb a1, 25(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1187,7 +1190,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 8(sp) +; LMULMAX1-RV64-NEXT: sb a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1206,7 +1209,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 7(sp) +; LMULMAX1-RV64-NEXT: sb a1, 23(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1225,7 +1228,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 6(sp) +; LMULMAX1-RV64-NEXT: sb a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1244,7 +1247,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 5(sp) +; LMULMAX1-RV64-NEXT: sb a1, 21(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1263,7 +1266,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 4(sp) +; LMULMAX1-RV64-NEXT: sb a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1282,7 +1285,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 3(sp) +; LMULMAX1-RV64-NEXT: sb a1, 19(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1301,7 +1304,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 2(sp) +; LMULMAX1-RV64-NEXT: sb a1, 18(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -1320,7 +1323,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 
1(sp) +; LMULMAX1-RV64-NEXT: sb a1, 17(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 ; LMULMAX1-RV64-NEXT: addi a5, a1, -1 @@ -1338,11 +1341,12 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a3 ; LMULMAX1-RV64-NEXT: mul a1, a1, a4 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 0(sp) +; LMULMAX1-RV64-NEXT: sb a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle8.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y @@ -1355,8 +1359,8 @@ define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v8i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 @@ -1385,7 +1389,7 @@ ; LMULMAX2-RV32-NEXT: addi a5, a5, 257 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -1405,7 +1409,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1424,7 +1428,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1443,7 +1447,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1462,7 +1466,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1481,7 +1485,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1500,7 +1504,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 20(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -1519,17 +1523,18 @@ ; 
LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 18(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v8i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu @@ -1582,7 +1587,7 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1601,7 +1606,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1620,7 +1625,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1639,7 +1644,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1658,7 +1663,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1677,7 +1682,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -1696,7 +1701,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 18(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 @@ -1714,17 +1719,18 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle16.v 
v25, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v8i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 @@ -1753,7 +1759,7 @@ ; LMULMAX1-RV32-NEXT: addi a5, a5, 257 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 0(sp) +; LMULMAX1-RV32-NEXT: sh a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -1773,7 +1779,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 14(sp) +; LMULMAX1-RV32-NEXT: sh a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1792,7 +1798,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 12(sp) +; LMULMAX1-RV32-NEXT: sh a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1811,7 +1817,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 10(sp) +; LMULMAX1-RV32-NEXT: sh a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1830,7 +1836,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 8(sp) +; LMULMAX1-RV32-NEXT: sh a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1849,7 +1855,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 6(sp) +; LMULMAX1-RV32-NEXT: sh a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1868,7 +1874,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 4(sp) +; LMULMAX1-RV32-NEXT: sh a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: or a1, a1, a6 @@ -1887,17 +1893,18 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: sh a1, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v8i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, 
sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu @@ -1950,7 +1957,7 @@ ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 14(sp) +; LMULMAX1-RV64-NEXT: sh a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -1969,7 +1976,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 12(sp) +; LMULMAX1-RV64-NEXT: sh a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -1988,7 +1995,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 10(sp) +; LMULMAX1-RV64-NEXT: sh a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -2007,7 +2014,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 8(sp) +; LMULMAX1-RV64-NEXT: sh a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -2026,7 +2033,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 6(sp) +; LMULMAX1-RV64-NEXT: sh a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -2045,7 +2052,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 4(sp) +; LMULMAX1-RV64-NEXT: sh a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 @@ -2064,7 +2071,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 2(sp) +; LMULMAX1-RV64-NEXT: sh a1, 18(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: or a1, a1, a6 ; LMULMAX1-RV64-NEXT: addi a2, a1, -1 @@ -2082,11 +2089,12 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 0(sp) +; LMULMAX1-RV64-NEXT: sh a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y @@ -2099,8 +2107,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v4i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 @@ -2127,7 
+2135,7 @@ ; LMULMAX2-RV32-NEXT: addi a5, a5, 257 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 @@ -2146,7 +2154,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: addi a3, a1, -1 @@ -2164,7 +2172,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: addi a3, a1, -1 @@ -2182,17 +2190,18 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 20(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v4i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, -32 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu @@ -2246,7 +2255,7 @@ ; LMULMAX2-RV64-NEXT: addi a1, a1, 257 ; LMULMAX2-RV64-NEXT: mul a4, a4, a1 ; LMULMAX2-RV64-NEXT: srli a4, a4, 56 -; LMULMAX2-RV64-NEXT: sw a4, 12(sp) +; LMULMAX2-RV64-NEXT: sw a4, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26 ; LMULMAX2-RV64-NEXT: or a4, a4, a6 @@ -2265,7 +2274,7 @@ ; LMULMAX2-RV64-NEXT: and a2, a2, a5 ; LMULMAX2-RV64-NEXT: mul a2, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 8(sp) +; LMULMAX2-RV64-NEXT: sw a2, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV64-NEXT: or a2, a2, a6 @@ -2284,7 +2293,7 @@ ; LMULMAX2-RV64-NEXT: and a2, a2, a5 ; LMULMAX2-RV64-NEXT: mul a2, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 4(sp) +; LMULMAX2-RV64-NEXT: sw a2, 20(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX2-RV64-NEXT: or a2, a2, a6 ; LMULMAX2-RV64-NEXT: addi a4, a2, -1 @@ -2302,17 +2311,18 @@ ; LMULMAX2-RV64-NEXT: and a2, a2, a5 ; LMULMAX2-RV64-NEXT: mul a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 0(sp) +; LMULMAX2-RV64-NEXT: sw a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 16 +; LMULMAX2-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: addi sp, sp, 32 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v4i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: 
.cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 @@ -2339,7 +2349,7 @@ ; LMULMAX1-RV32-NEXT: addi a5, a5, 257 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -2358,7 +2368,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a3, a1, -1 @@ -2376,7 +2386,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a3, a1, -1 @@ -2394,17 +2404,18 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v4i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu @@ -2458,7 +2469,7 @@ ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: mul a4, a4, a1 ; LMULMAX1-RV64-NEXT: srli a4, a4, 56 -; LMULMAX1-RV64-NEXT: sw a4, 12(sp) +; LMULMAX1-RV64-NEXT: sw a4, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 ; LMULMAX1-RV64-NEXT: or a4, a4, a6 @@ -2477,7 +2488,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 8(sp) +; LMULMAX1-RV64-NEXT: sw a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a6 @@ -2496,7 +2507,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 4(sp) +; LMULMAX1-RV64-NEXT: sw a2, 20(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: or a2, a2, a6 ; LMULMAX1-RV64-NEXT: addi a4, a2, -1 @@ -2514,11 +2525,12 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV64-NEXT: vse32.v 
v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y @@ -2531,12 +2543,12 @@ define void @cttz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v2i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -16 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX2-RV32-NEXT: sw zero, 12(sp) -; LMULMAX2-RV32-NEXT: sw zero, 4(sp) +; LMULMAX2-RV32-NEXT: sw zero, 28(sp) +; LMULMAX2-RV32-NEXT: sw zero, 20(sp) ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 @@ -2586,7 +2598,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_3: -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 16(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v25 @@ -2628,12 +2640,13 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_6: -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v2i64: @@ -2712,12 +2725,12 @@ ; ; LMULMAX1-RV32-LABEL: cttz_v2i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -16 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: sw zero, 12(sp) -; LMULMAX1-RV32-NEXT: sw zero, 4(sp) +; LMULMAX1-RV32-NEXT: sw zero, 28(sp) +; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: addi a6, zero, 32 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a4, a1, 1365 @@ -2767,7 +2780,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a2 ; LMULMAX1-RV32-NEXT: srli a5, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB3_3: -; LMULMAX1-RV32-NEXT: sw a5, 0(sp) +; LMULMAX1-RV32-NEXT: sw a5, 16(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a5, v25 @@ -2809,12 +2822,13 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a2 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB3_6: -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v2i64: @@ -2901,13 +2915,13 @@ define void @cttz_v32i8(<32 x i8>* %x, <32 x i8>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v32i8: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: 
addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 @@ -2938,7 +2952,7 @@ ; LMULMAX2-RV32-NEXT: addi a5, a5, 257 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) +; LMULMAX2-RV32-NEXT: sb a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -2958,7 +2972,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) +; LMULMAX2-RV32-NEXT: sb a1, 63(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -2977,7 +2991,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) +; LMULMAX2-RV32-NEXT: sb a1, 62(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -2996,7 +3010,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) +; LMULMAX2-RV32-NEXT: sb a1, 61(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3015,7 +3029,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) +; LMULMAX2-RV32-NEXT: sb a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3034,7 +3048,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) +; LMULMAX2-RV32-NEXT: sb a1, 59(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3053,7 +3067,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) +; LMULMAX2-RV32-NEXT: sb a1, 58(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3072,7 +3086,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) +; LMULMAX2-RV32-NEXT: sb a1, 57(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3091,7 +3105,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) +; LMULMAX2-RV32-NEXT: sb a1, 56(sp) ; 
LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3110,7 +3124,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) +; LMULMAX2-RV32-NEXT: sb a1, 55(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3129,7 +3143,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) +; LMULMAX2-RV32-NEXT: sb a1, 54(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3148,7 +3162,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) +; LMULMAX2-RV32-NEXT: sb a1, 53(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3167,7 +3181,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) +; LMULMAX2-RV32-NEXT: sb a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3186,7 +3200,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) +; LMULMAX2-RV32-NEXT: sb a1, 51(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3205,7 +3219,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) +; LMULMAX2-RV32-NEXT: sb a1, 50(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3224,7 +3238,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) +; LMULMAX2-RV32-NEXT: sb a1, 49(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3243,7 +3257,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) +; LMULMAX2-RV32-NEXT: sb a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3262,7 +3276,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) +; LMULMAX2-RV32-NEXT: sb a1, 47(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3281,7 +3295,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) +; LMULMAX2-RV32-NEXT: sb a1, 46(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3300,7 +3314,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; 
LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) +; LMULMAX2-RV32-NEXT: sb a1, 45(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3319,7 +3333,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) +; LMULMAX2-RV32-NEXT: sb a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3338,7 +3352,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) +; LMULMAX2-RV32-NEXT: sb a1, 43(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3357,7 +3371,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) +; LMULMAX2-RV32-NEXT: sb a1, 42(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3376,7 +3390,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) +; LMULMAX2-RV32-NEXT: sb a1, 41(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3395,7 +3409,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) +; LMULMAX2-RV32-NEXT: sb a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3414,7 +3428,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) +; LMULMAX2-RV32-NEXT: sb a1, 39(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3433,7 +3447,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) +; LMULMAX2-RV32-NEXT: sb a1, 38(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3452,7 +3466,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: sb a1, 37(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3471,7 +3485,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) +; LMULMAX2-RV32-NEXT: sb a1, 36(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3490,7 +3504,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) +; LMULMAX2-RV32-NEXT: sb a1, 35(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ 
-3509,7 +3523,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: sb a1, 34(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: ori a1, a1, 256 @@ -3528,25 +3542,26 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) +; LMULMAX2-RV32-NEXT: sb a1, 33(sp) ; LMULMAX2-RV32-NEXT: vsetvli zero, a6, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle8.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse8.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v32i8: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: addi a6, zero, 32 @@ -3601,7 +3616,7 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) +; LMULMAX2-RV64-NEXT: sb a1, 63(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3620,7 +3635,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) +; LMULMAX2-RV64-NEXT: sb a1, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3639,7 +3654,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) +; LMULMAX2-RV64-NEXT: sb a1, 61(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3658,7 +3673,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) +; LMULMAX2-RV64-NEXT: sb a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3677,7 +3692,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) +; LMULMAX2-RV64-NEXT: sb a1, 59(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; 
LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3696,7 +3711,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) +; LMULMAX2-RV64-NEXT: sb a1, 58(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3715,7 +3730,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) +; LMULMAX2-RV64-NEXT: sb a1, 57(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3734,7 +3749,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) +; LMULMAX2-RV64-NEXT: sb a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3753,7 +3768,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) +; LMULMAX2-RV64-NEXT: sb a1, 55(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3772,7 +3787,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) +; LMULMAX2-RV64-NEXT: sb a1, 54(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3791,7 +3806,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) +; LMULMAX2-RV64-NEXT: sb a1, 53(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3810,7 +3825,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) +; LMULMAX2-RV64-NEXT: sb a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3829,7 +3844,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) +; LMULMAX2-RV64-NEXT: sb a1, 51(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3848,7 +3863,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) +; LMULMAX2-RV64-NEXT: sb a1, 50(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3867,7 +3882,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) +; LMULMAX2-RV64-NEXT: sb a1, 49(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3886,7 +3901,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) +; 
LMULMAX2-RV64-NEXT: sb a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3905,7 +3920,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) +; LMULMAX2-RV64-NEXT: sb a1, 47(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3924,7 +3939,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) +; LMULMAX2-RV64-NEXT: sb a1, 46(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3943,7 +3958,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) +; LMULMAX2-RV64-NEXT: sb a1, 45(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3962,7 +3977,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) +; LMULMAX2-RV64-NEXT: sb a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -3981,7 +3996,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) +; LMULMAX2-RV64-NEXT: sb a1, 43(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4000,7 +4015,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) +; LMULMAX2-RV64-NEXT: sb a1, 42(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4019,7 +4034,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) +; LMULMAX2-RV64-NEXT: sb a1, 41(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4038,7 +4053,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) +; LMULMAX2-RV64-NEXT: sb a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4057,7 +4072,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) +; LMULMAX2-RV64-NEXT: sb a1, 39(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4076,7 +4091,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) +; LMULMAX2-RV64-NEXT: sb a1, 38(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4095,7 +4110,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; 
LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: sb a1, 37(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4114,7 +4129,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) +; LMULMAX2-RV64-NEXT: sb a1, 36(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4133,7 +4148,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) +; LMULMAX2-RV64-NEXT: sb a1, 35(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4152,7 +4167,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: sb a1, 34(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 @@ -4171,7 +4186,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) +; LMULMAX2-RV64-NEXT: sb a1, 33(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: ori a1, a1, 256 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 @@ -4189,20 +4204,21 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) +; LMULMAX2-RV64-NEXT: sb a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetvli zero, a6, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle8.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse8.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v32i8: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle8.v v26, (a6) @@ -4232,7 +4248,7 @@ ; LMULMAX1-RV32-NEXT: addi a5, a5, 257 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 16(sp) +; LMULMAX1-RV32-NEXT: sb a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -4252,7 +4268,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 31(sp) +; LMULMAX1-RV32-NEXT: sb a1, 47(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4271,7 +4287,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: 
mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 30(sp) +; LMULMAX1-RV32-NEXT: sb a1, 46(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4290,7 +4306,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 29(sp) +; LMULMAX1-RV32-NEXT: sb a1, 45(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4309,7 +4325,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 28(sp) +; LMULMAX1-RV32-NEXT: sb a1, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4328,7 +4344,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 27(sp) +; LMULMAX1-RV32-NEXT: sb a1, 43(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4347,7 +4363,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 26(sp) +; LMULMAX1-RV32-NEXT: sb a1, 42(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4366,7 +4382,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 25(sp) +; LMULMAX1-RV32-NEXT: sb a1, 41(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4385,7 +4401,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 24(sp) +; LMULMAX1-RV32-NEXT: sb a1, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4404,7 +4420,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 23(sp) +; LMULMAX1-RV32-NEXT: sb a1, 39(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4423,7 +4439,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 22(sp) +; LMULMAX1-RV32-NEXT: sb a1, 38(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4442,7 +4458,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 21(sp) +; LMULMAX1-RV32-NEXT: sb a1, 37(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4461,7 +4477,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 20(sp) +; LMULMAX1-RV32-NEXT: sb a1, 36(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; 
LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4480,7 +4496,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 19(sp) +; LMULMAX1-RV32-NEXT: sb a1, 35(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4499,7 +4515,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 18(sp) +; LMULMAX1-RV32-NEXT: sb a1, 34(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4518,7 +4534,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 17(sp) +; LMULMAX1-RV32-NEXT: sb a1, 33(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -4536,7 +4552,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 0(sp) +; LMULMAX1-RV32-NEXT: sb a1, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4555,7 +4571,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 15(sp) +; LMULMAX1-RV32-NEXT: sb a1, 31(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4574,7 +4590,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 14(sp) +; LMULMAX1-RV32-NEXT: sb a1, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4593,7 +4609,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 13(sp) +; LMULMAX1-RV32-NEXT: sb a1, 29(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4612,7 +4628,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 12(sp) +; LMULMAX1-RV32-NEXT: sb a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4631,7 +4647,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 11(sp) +; LMULMAX1-RV32-NEXT: sb a1, 27(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4650,7 +4666,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 10(sp) +; LMULMAX1-RV32-NEXT: sb a1, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4669,7 +4685,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 9(sp) +; LMULMAX1-RV32-NEXT: sb a1, 
25(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4688,7 +4704,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 8(sp) +; LMULMAX1-RV32-NEXT: sb a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4707,7 +4723,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 7(sp) +; LMULMAX1-RV32-NEXT: sb a1, 23(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4726,7 +4742,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 6(sp) +; LMULMAX1-RV32-NEXT: sb a1, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4745,7 +4761,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 5(sp) +; LMULMAX1-RV32-NEXT: sb a1, 21(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4764,7 +4780,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 4(sp) +; LMULMAX1-RV32-NEXT: sb a1, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4783,7 +4799,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 3(sp) +; LMULMAX1-RV32-NEXT: sb a1, 19(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4802,7 +4818,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 2(sp) +; LMULMAX1-RV32-NEXT: sb a1, 18(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: ori a1, a1, 256 @@ -4821,20 +4837,21 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sb a1, 1(sp) +; LMULMAX1-RV32-NEXT: sb a1, 17(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle8.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle8.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse8.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v32i8: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle8.v v26, (a6) @@ -4886,7 +4903,7 @@ ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; 
LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 16(sp) +; LMULMAX1-RV64-NEXT: sb a1, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e8,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 15 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 @@ -4906,7 +4923,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 31(sp) +; LMULMAX1-RV64-NEXT: sb a1, 47(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -4925,7 +4942,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 30(sp) +; LMULMAX1-RV64-NEXT: sb a1, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -4944,7 +4961,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 29(sp) +; LMULMAX1-RV64-NEXT: sb a1, 45(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -4963,7 +4980,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 28(sp) +; LMULMAX1-RV64-NEXT: sb a1, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 11 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -4982,7 +4999,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 27(sp) +; LMULMAX1-RV64-NEXT: sb a1, 43(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5001,7 +5018,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 26(sp) +; LMULMAX1-RV64-NEXT: sb a1, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5020,7 +5037,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 25(sp) +; LMULMAX1-RV64-NEXT: sb a1, 41(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5039,7 +5056,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 24(sp) +; LMULMAX1-RV64-NEXT: sb a1, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5058,7 +5075,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 23(sp) +; LMULMAX1-RV64-NEXT: sb a1, 39(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5077,7 +5094,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 22(sp) +; LMULMAX1-RV64-NEXT: sb a1, 38(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; 
LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5096,7 +5113,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 21(sp) +; LMULMAX1-RV64-NEXT: sb a1, 37(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5115,7 +5132,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 20(sp) +; LMULMAX1-RV64-NEXT: sb a1, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5134,7 +5151,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 19(sp) +; LMULMAX1-RV64-NEXT: sb a1, 35(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5153,7 +5170,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 18(sp) +; LMULMAX1-RV64-NEXT: sb a1, 34(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5172,7 +5189,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 17(sp) +; LMULMAX1-RV64-NEXT: sb a1, 33(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 15 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5191,7 +5208,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 15(sp) +; LMULMAX1-RV64-NEXT: sb a1, 31(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 14 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5210,7 +5227,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 14(sp) +; LMULMAX1-RV64-NEXT: sb a1, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 13 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5229,7 +5246,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 13(sp) +; LMULMAX1-RV64-NEXT: sb a1, 29(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 12 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5248,7 +5265,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 12(sp) +; LMULMAX1-RV64-NEXT: sb a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 11 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5267,7 +5284,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 11(sp) +; LMULMAX1-RV64-NEXT: sb a1, 27(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 10 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5286,7 +5303,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 10(sp) +; 
LMULMAX1-RV64-NEXT: sb a1, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 9 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5305,7 +5322,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 9(sp) +; LMULMAX1-RV64-NEXT: sb a1, 25(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 8 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5324,7 +5341,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 8(sp) +; LMULMAX1-RV64-NEXT: sb a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5343,7 +5360,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 7(sp) +; LMULMAX1-RV64-NEXT: sb a1, 23(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5362,7 +5379,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 6(sp) +; LMULMAX1-RV64-NEXT: sb a1, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5381,7 +5398,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 5(sp) +; LMULMAX1-RV64-NEXT: sb a1, 21(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5400,7 +5417,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 4(sp) +; LMULMAX1-RV64-NEXT: sb a1, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5419,7 +5436,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 3(sp) +; LMULMAX1-RV64-NEXT: sb a1, 19(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5438,7 +5455,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 2(sp) +; LMULMAX1-RV64-NEXT: sb a1, 18(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 @@ -5457,7 +5474,7 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 1(sp) +; LMULMAX1-RV64-NEXT: sb a1, 17(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: ori a1, a1, 256 ; LMULMAX1-RV64-NEXT: addi a2, a1, -1 @@ -5475,14 +5492,15 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sb a1, 0(sp) +; LMULMAX1-RV64-NEXT: sb a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle8.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; 
LMULMAX1-RV64-NEXT: vle8.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse8.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y @@ -5495,13 +5513,13 @@ define void @cttz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v16i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -5532,7 +5550,7 @@ ; LMULMAX2-RV32-NEXT: addi a5, a5, 257 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: sh a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 @@ -5552,7 +5570,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 30(sp) +; LMULMAX2-RV32-NEXT: sh a1, 62(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5571,7 +5589,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 28(sp) +; LMULMAX2-RV32-NEXT: sh a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5590,7 +5608,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 26(sp) +; LMULMAX2-RV32-NEXT: sh a1, 58(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5609,7 +5627,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 24(sp) +; LMULMAX2-RV32-NEXT: sh a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5628,7 +5646,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 22(sp) +; LMULMAX2-RV32-NEXT: sh a1, 54(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5647,7 +5665,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 20(sp) +; LMULMAX2-RV32-NEXT: sh a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5666,7 +5684,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul 
a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 18(sp) +; LMULMAX2-RV32-NEXT: sh a1, 50(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5685,7 +5703,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 16(sp) +; LMULMAX2-RV32-NEXT: sh a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5704,7 +5722,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: sh a1, 46(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5723,7 +5741,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: sh a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5742,7 +5760,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: sh a1, 42(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5761,7 +5779,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: sh a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5780,7 +5798,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: sh a1, 38(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5799,7 +5817,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: sh a1, 36(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: or a1, a1, a6 @@ -5818,25 +5836,26 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: sh a1, 34(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v16i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; 
LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu @@ -5891,7 +5910,7 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 30(sp) +; LMULMAX2-RV64-NEXT: sh a1, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -5910,7 +5929,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 28(sp) +; LMULMAX2-RV64-NEXT: sh a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -5929,7 +5948,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 26(sp) +; LMULMAX2-RV64-NEXT: sh a1, 58(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -5948,7 +5967,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 24(sp) +; LMULMAX2-RV64-NEXT: sh a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -5967,7 +5986,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 22(sp) +; LMULMAX2-RV64-NEXT: sh a1, 54(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -5986,7 +6005,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 20(sp) +; LMULMAX2-RV64-NEXT: sh a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6005,7 +6024,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 18(sp) +; LMULMAX2-RV64-NEXT: sh a1, 50(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6024,7 +6043,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 16(sp) +; LMULMAX2-RV64-NEXT: sh a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6043,7 +6062,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: sh a1, 46(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 
; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6062,7 +6081,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: sh a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6081,7 +6100,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: sh a1, 42(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6100,7 +6119,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: sh a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6119,7 +6138,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: sh a1, 38(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6138,7 +6157,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: sh a1, 36(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -6157,7 +6176,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: sh a1, 34(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 @@ -6175,20 +6194,21 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: sh a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle16.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle16.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v16i16: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a6) @@ -6219,7 +6239,7 @@ ; LMULMAX1-RV32-NEXT: addi a1, a1, 257 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 16(sp) +; LMULMAX1-RV32-NEXT: sh a2, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV32-NEXT: 
vmv.x.s a2, v27 @@ -6239,7 +6259,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 30(sp) +; LMULMAX1-RV32-NEXT: sh a2, 46(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6258,7 +6278,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 28(sp) +; LMULMAX1-RV32-NEXT: sh a2, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6277,7 +6297,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 26(sp) +; LMULMAX1-RV32-NEXT: sh a2, 42(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6296,7 +6316,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 24(sp) +; LMULMAX1-RV32-NEXT: sh a2, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6315,7 +6335,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 22(sp) +; LMULMAX1-RV32-NEXT: sh a2, 38(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6334,7 +6354,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 20(sp) +; LMULMAX1-RV32-NEXT: sh a2, 36(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6353,7 +6373,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 18(sp) +; LMULMAX1-RV32-NEXT: sh a2, 34(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 ; LMULMAX1-RV32-NEXT: addi a3, a2, -1 @@ -6371,7 +6391,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 0(sp) +; LMULMAX1-RV32-NEXT: sh a2, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6390,7 +6410,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 14(sp) +; LMULMAX1-RV32-NEXT: sh a2, 30(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6409,7 +6429,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 12(sp) +; LMULMAX1-RV32-NEXT: sh a2, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6428,7 +6448,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 10(sp) +; LMULMAX1-RV32-NEXT: sh a2, 26(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, 
v25, 4 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6447,7 +6467,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 8(sp) +; LMULMAX1-RV32-NEXT: sh a2, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6466,7 +6486,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 6(sp) +; LMULMAX1-RV32-NEXT: sh a2, 22(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6485,7 +6505,7 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a2, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a2, 24 -; LMULMAX1-RV32-NEXT: sh a2, 4(sp) +; LMULMAX1-RV32-NEXT: sh a2, 20(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: or a2, a2, a7 @@ -6504,20 +6524,21 @@ ; LMULMAX1-RV32-NEXT: and a2, a2, a5 ; LMULMAX1-RV32-NEXT: mul a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: sh a1, 18(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle16.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse16.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v16i16: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a6) @@ -6570,7 +6591,7 @@ ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: sh a2, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 @@ -6590,7 +6611,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: sh a2, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6609,7 +6630,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: sh a2, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6628,7 +6649,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: sh a2, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6647,7 +6668,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 
-; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: sh a2, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6666,7 +6687,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: sh a2, 38(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6685,7 +6706,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: sh a2, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6704,7 +6725,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 18(sp) +; LMULMAX1-RV64-NEXT: sh a2, 34(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6723,7 +6744,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 14(sp) +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6742,7 +6763,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 12(sp) +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6761,7 +6782,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 10(sp) +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6780,7 +6801,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 8(sp) +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6799,7 +6820,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 6(sp) +; LMULMAX1-RV64-NEXT: sh a2, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6818,7 +6839,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 4(sp) +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -6837,7 +6858,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sh a2, 2(sp) +; LMULMAX1-RV64-NEXT: sh a2, 18(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 @@ -6855,14 +6876,15 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; 
LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sh a1, 0(sp) +; LMULMAX1-RV64-NEXT: sh a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle16.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse16.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y @@ -6875,13 +6897,13 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v8i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -6910,7 +6932,7 @@ ; LMULMAX2-RV32-NEXT: addi a4, a4, 257 ; LMULMAX2-RV32-NEXT: mul a5, a5, a4 ; LMULMAX2-RV32-NEXT: srli a5, a5, 24 -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28 @@ -6929,7 +6951,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32-NEXT: sw a1, 60(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -6947,7 +6969,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32-NEXT: sw a1, 56(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -6965,7 +6987,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32-NEXT: sw a1, 52(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -6983,7 +7005,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32-NEXT: sw a1, 48(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -7001,7 +7023,7 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: sw a1, 44(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -7019,7 +7041,7 @@ ; LMULMAX2-RV32-NEXT: and 
a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: addi a5, a1, -1 @@ -7037,25 +7059,26 @@ ; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a4 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: sw a1, 36(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v8i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu @@ -7111,7 +7134,7 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 28(sp) +; LMULMAX2-RV64-NEXT: sw a1, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -7130,7 +7153,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 24(sp) +; LMULMAX2-RV64-NEXT: sw a1, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -7149,7 +7172,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 20(sp) +; LMULMAX2-RV64-NEXT: sw a1, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -7168,7 +7191,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 16(sp) +; LMULMAX2-RV64-NEXT: sw a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -7187,7 +7210,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 12(sp) +; LMULMAX2-RV64-NEXT: sw a1, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ 
-7206,7 +7229,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 8(sp) +; LMULMAX2-RV64-NEXT: sw a1, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 @@ -7225,7 +7248,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: sw a1, 36(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: or a1, a1, a6 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 @@ -7243,20 +7266,21 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 0(sp) +; LMULMAX2-RV64-NEXT: sw a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v8i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a6) @@ -7285,7 +7309,7 @@ ; LMULMAX1-RV32-NEXT: addi a5, a5, 257 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 @@ -7304,7 +7328,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: sw a1, 44(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -7322,7 +7346,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -7340,7 +7364,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: sw a1, 36(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -7357,7 +7381,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -7375,7 +7399,7 @@ ; 
LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -7393,7 +7417,7 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 @@ -7411,20 +7435,21 @@ ; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v8i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, -48 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a6) @@ -7478,7 +7503,7 @@ ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 16(sp) +; LMULMAX1-RV64-NEXT: sw a2, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 @@ -7498,7 +7523,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 28(sp) +; LMULMAX1-RV64-NEXT: sw a2, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -7517,7 +7542,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 24(sp) +; LMULMAX1-RV64-NEXT: sw a2, 40(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -7536,7 +7561,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 20(sp) +; LMULMAX1-RV64-NEXT: sw a2, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -7555,7 +7580,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 12(sp) +; LMULMAX1-RV64-NEXT: sw a2, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -7574,7 +7599,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 8(sp) +; LMULMAX1-RV64-NEXT: sw a2, 24(sp) ; LMULMAX1-RV64-NEXT: 
vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 @@ -7593,7 +7618,7 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 4(sp) +; LMULMAX1-RV64-NEXT: sw a2, 20(sp) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: or a2, a2, a7 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 @@ -7611,14 +7636,15 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV64-NEXT: addi a1, sp, 32 ; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: addi sp, sp, 48 ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y @@ -7631,21 +7657,21 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-LABEL: cttz_v4i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: addi sp, sp, -96 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; LMULMAX2-RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 ; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 +; LMULMAX2-RV32-NEXT: addi s0, sp, 96 ; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV32-NEXT: sw zero, 28(sp) -; LMULMAX2-RV32-NEXT: sw zero, 20(sp) -; LMULMAX2-RV32-NEXT: sw zero, 12(sp) -; LMULMAX2-RV32-NEXT: sw zero, 4(sp) +; LMULMAX2-RV32-NEXT: sw zero, 60(sp) +; LMULMAX2-RV32-NEXT: sw zero, 52(sp) +; LMULMAX2-RV32-NEXT: sw zero, 44(sp) +; LMULMAX2-RV32-NEXT: sw zero, 36(sp) ; LMULMAX2-RV32-NEXT: addi a6, zero, 32 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 @@ -7695,7 +7721,7 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_3: -; LMULMAX2-RV32-NEXT: sw a5, 0(sp) +; LMULMAX2-RV32-NEXT: sw a5, 32(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28 @@ -7739,7 +7765,7 @@ ; LMULMAX2-RV32-NEXT: .LBB7_6: ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sw a5, 24(sp) +; LMULMAX2-RV32-NEXT: sw a5, 56(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX2-RV32-NEXT: # %bb.7: ; LMULMAX2-RV32-NEXT: vsrl.vx v28, v28, a6 @@ -7780,7 +7806,7 @@ ; LMULMAX2-RV32-NEXT: .LBB7_9: ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sw a5, 16(sp) +; LMULMAX2-RV32-NEXT: sw a5, 48(sp) ; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_11 ; LMULMAX2-RV32-NEXT: # %bb.10: ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a6 @@ -7819,26 +7845,27 @@ ; LMULMAX2-RV32-NEXT: mul a1, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; 
LMULMAX2-RV32-NEXT: .LBB7_12: -; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) +; LMULMAX2-RV32-NEXT: addi a1, sp, 32 +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 +; LMULMAX2-RV32-NEXT: addi sp, s0, -96 +; LMULMAX2-RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; LMULMAX2-RV32-NEXT: addi sp, sp, 96 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: cttz_v4i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: addi sp, sp, -96 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 96 +; LMULMAX2-RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; LMULMAX2-RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 ; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 +; LMULMAX2-RV64-NEXT: addi s0, sp, 96 ; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu @@ -7891,7 +7918,7 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a3, a3, a5 ; LMULMAX2-RV64-NEXT: srli a3, a3, 56 -; LMULMAX2-RV64-NEXT: sd a3, 24(sp) +; LMULMAX2-RV64-NEXT: sd a3, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV64-NEXT: addi a1, a3, -1 @@ -7909,7 +7936,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 16(sp) +; LMULMAX2-RV64-NEXT: sd a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 ; LMULMAX2-RV64-NEXT: addi a3, a1, -1 @@ -7927,7 +7954,7 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: sd a1, 40(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: addi a3, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 @@ -7944,26 +7971,27 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 0(sp) +; LMULMAX2-RV64-NEXT: sd a1, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v26, (sp) +; LMULMAX2-RV64-NEXT: addi a1, sp, 32 +; LMULMAX2-RV64-NEXT: vle64.v v26, (a1) ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 +; LMULMAX2-RV64-NEXT: addi sp, s0, -96 +; LMULMAX2-RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; LMULMAX2-RV64-NEXT: addi sp, sp, 96 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi sp, sp, -32 -; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; 
LMULMAX1-RV32-NEXT: addi sp, sp, -48 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a7, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v26, (a7) -; LMULMAX1-RV32-NEXT: sw zero, 28(sp) -; LMULMAX1-RV32-NEXT: sw zero, 20(sp) +; LMULMAX1-RV32-NEXT: sw zero, 44(sp) +; LMULMAX1-RV32-NEXT: sw zero, 36(sp) ; LMULMAX1-RV32-NEXT: addi a6, zero, 32 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a5, a1, 1365 @@ -8013,7 +8041,7 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_3: -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 @@ -8055,10 +8083,10 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_6: -; LMULMAX1-RV32-NEXT: sw a1, 24(sp) -; LMULMAX1-RV32-NEXT: sw zero, 12(sp) +; LMULMAX1-RV32-NEXT: sw a1, 40(sp) +; LMULMAX1-RV32-NEXT: sw zero, 28(sp) ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV32-NEXT: sw zero, 4(sp) +; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 @@ -8099,7 +8127,7 @@ ; LMULMAX1-RV32-NEXT: .LBB7_9: ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a6 @@ -8138,15 +8166,16 @@ ; LMULMAX1-RV32-NEXT: mul a1, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_12: -; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) ; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: addi a1, sp, 32 ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV32-NEXT: vse64.v v26, (a7) -; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: cttz_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-emergency-slot.mir @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +# RUN: llc -mtriple riscv64 -mattr=+experimental-v -start-before=prologepilog -o - \ +# RUN: -verify-machineinstrs %s | FileCheck %s +--- | + target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" + target triple = "riscv64" + + define weak_odr dso_local void @fixedlen_vector_spillslot(i8* %ay) nounwind { + ; CHECK-LABEL: fixedlen_vector_spillslot: + ; CHECK: # %bb.0: # %entry + ; CHECK-NEXT: addi sp, sp, -48 + ; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill + ; CHECK-NEXT: sd a0, 32(sp) + ; CHECK-NEXT: sd a0, 16(sp) + ; CHECK-NEXT: vsetivli a5, 1, e16,m1,ta,mu + ; CHECK-NEXT: sd a1, 8(sp) + ; CHECK-NEXT: addi a1, sp, 24 + ; CHECK-NEXT: vs1r.v v25, (a1) # Unknown-size Folded Spill + ; CHECK-NEXT: ld a1, 8(sp) + ; CHECK-NEXT: call fixedlen_vector_spillslot@plt + ; 
CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload + ; CHECK-NEXT: addi sp, sp, 48 + ; CHECK-NEXT: ret + entry: + ret void + } + +... +--- +name: fixedlen_vector_spillslot +alignment: 2 +tracksRegLiveness: false +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 2, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + liveins: $x1, $x5, $x6, $x7, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31, $v25 + + SD $x10, %stack.0, 0 + SD $x10, %stack.2, 0 + dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype + PseudoVSPILL_M1 killed renamable $v25, %stack.1 :: (store unknown-size into %stack.1, align 8) + ; This is here just to make all the eligible registers live at this point. + ; This way, when we replace the frame index %stack.1 with its actual address, + ; we have to allocate a virtual register to compute it. + ; A later run of the register scavenger won't find an available register + ; either, so it will have to spill one to the emergency spill slot. + PseudoCALL target-flags(riscv-plt) @fixedlen_vector_spillslot, csr_ilp32_lp64, implicit-def $x1, implicit-def $x2, implicit $x1, implicit $x5, implicit $x6, implicit $x7, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x28, implicit $x29, implicit $x30, implicit $x31 + PseudoRET +...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -151,10 +151,10 @@ define i1 @extractelt_v256i1(<256 x i8>* %x, i64 %idx) nounwind { ; RV32-LABEL: extractelt_v256i1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -384 -; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill -; RV32-NEXT: addi s0, sp, 384 +; RV32-NEXT: addi sp, sp, -512 +; RV32-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 512 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: andi a1, a1, 255 ; RV32-NEXT: addi a2, a0, 128 @@ -162,30 +162,31 @@ ; RV32-NEXT: vsetvli zero, a3, e8,m8,ta,mu ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: vle8.v v16, (a2) -; RV32-NEXT: mv a0, sp +; RV32-NEXT: addi a0, sp, 128 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: vmseq.vi v25, v8, 0 ; RV32-NEXT: vmseq.vi v0, v16, 0 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v16, v8, 1, v0 -; RV32-NEXT: addi a1, sp, 128 +; RV32-NEXT: addi a1, sp, 256 ; RV32-NEXT: vse8.v v16, (a1) ; RV32-NEXT: vmv1r.v v0, v25 ; RV32-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-NEXT: vse8.v v8, (sp) +; RV32-NEXT: addi a1, sp, 128 +; RV32-NEXT: vse8.v v8, (a1) ; RV32-NEXT: lb a0, 0(a0) -; RV32-NEXT: addi sp, s0, -384 -; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 384 +; RV32-NEXT: addi sp, s0, -512 +; RV32-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 512 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v256i1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -384 -; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill -; RV64-NEXT: addi s0, sp, 384 +; RV64-NEXT: addi sp, sp, -512 +; RV64-NEXT: sd ra, 504(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 496(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 512 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: andi a1, a1, 255 ; RV64-NEXT: addi a2, a0, 128 @@ -193,22 +194,23 @@ ; RV64-NEXT: vsetvli zero, a3, e8,m8,ta,mu ; RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: vle8.v v16, (a2) -; RV64-NEXT: mv a0, sp +; RV64-NEXT: addi a0, sp, 128 ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vmseq.vi v25, v8, 0 ; RV64-NEXT: vmseq.vi v0, v16, 0 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v16, v8, 1, v0 -; RV64-NEXT: addi a1, sp, 128 +; RV64-NEXT: addi a1, sp, 256 ; RV64-NEXT: vse8.v v16, (a1) ; RV64-NEXT: vmv1r.v v0, v25 ; RV64-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-NEXT: vse8.v v8, (sp) +; RV64-NEXT: addi a1, sp, 128 +; RV64-NEXT: vse8.v v8, (a1) ; RV64-NEXT: lb a0, 0(a0) -; RV64-NEXT: addi sp, s0, -384 -; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 384 +; RV64-NEXT: addi sp, s0, -512 +; RV64-NEXT: ld s0, 496(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 504(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 512 ; RV64-NEXT: ret %a = load <256 x i8>, <256 x i8>* %x %b = icmp eq <256 x i8> %a, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ 
-35,23 +35,24 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize { ; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -16 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; LMULMAX1-NEXT: vfmv.f.s ft0, v10 -; LMULMAX1-NEXT: fsw ft0, 8(sp) +; LMULMAX1-NEXT: fsw ft0, 24(sp) ; LMULMAX1-NEXT: vfmv.f.s ft0, v8 -; LMULMAX1-NEXT: fsw ft0, 0(sp) +; LMULMAX1-NEXT: fsw ft0, 16(sp) ; LMULMAX1-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; LMULMAX1-NEXT: vslidedown.vi v26, v10, 7 ; LMULMAX1-NEXT: vfmv.f.s ft0, v26 -; LMULMAX1-NEXT: fsw ft0, 12(sp) +; LMULMAX1-NEXT: fsw ft0, 28(sp) ; LMULMAX1-NEXT: vslidedown.vi v26, v8, 7 ; LMULMAX1-NEXT: vfmv.f.s ft0, v26 -; LMULMAX1-NEXT: fsw ft0, 4(sp) +; LMULMAX1-NEXT: fsw ft0, 20(sp) ; LMULMAX1-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: vle32.v v8, (sp) -; LMULMAX1-NEXT: addi sp, sp, 16 +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret ; ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -163,22 +163,24 @@ ; ; LMULMAX1-LABEL: fpround_v8f32_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -16 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 4, e32,m1,ta,mu ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle32.v v25, (a2) ; LMULMAX1-NEXT: vle32.v v26, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v27, v25 -; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v27, (a0) ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 -; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: vle16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 16 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x float>, <8 x float>* %x %d = fptrunc <8 x float> %a to <8 x half> @@ -200,8 +202,8 @@ ; ; LMULMAX1-LABEL: fpround_v8f64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-NEXT: addi sp, sp, -48 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-NEXT: vle64.v v25, (a0) ; LMULMAX1-NEXT: addi a2, a0, 32 @@ -214,39 +216,41 @@ ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v29, v27 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v27, v29 -; LMULMAX1-NEXT: addi a0, sp, 12 +; LMULMAX1-NEXT: addi a0, sp, 28 ; LMULMAX1-NEXT: vse16.v v27, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 4 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: 
vfncvt.rod.f.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 8 -; LMULMAX1-NEXT: vle16.v v26, (a0) ; LMULMAX1-NEXT: addi a0, sp, 24 +; LMULMAX1-NEXT: vle16.v v26, (a0) +; LMULMAX1-NEXT: addi a0, sp, 40 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 -; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: vle16.v v25, (sp) ; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 +; LMULMAX1-NEXT: addi sp, sp, 48 ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptrunc <8 x double> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -466,8 +466,8 @@ ; ; LMULMAX1-LABEL: si2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-NEXT: addi sp, sp, -48 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-NEXT: vle64.v v25, (a0) ; LMULMAX1-NEXT: addi a2, a0, 32 @@ -480,39 +480,41 @@ ; LMULMAX1-NEXT: vfncvt.f.x.w v29, v27 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v27, v29 -; LMULMAX1-NEXT: addi a0, sp, 12 +; LMULMAX1-NEXT: addi a0, sp, 28 ; LMULMAX1-NEXT: vse16.v v27, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 4 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 8 -; LMULMAX1-NEXT: vle16.v v26, (a0) ; LMULMAX1-NEXT: addi a0, sp, 24 +; LMULMAX1-NEXT: vle16.v v26, (a0) +; LMULMAX1-NEXT: addi a0, sp, 40 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.x.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 -; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: vle16.v v25, (sp) ; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; 
LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 +; LMULMAX1-NEXT: addi sp, sp, 48 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = sitofp <8 x i64> %a to <8 x half> @@ -534,8 +536,8 @@ ; ; LMULMAX1-LABEL: ui2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-NEXT: addi sp, sp, -48 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-NEXT: vsetivli zero, 2, e64,m1,ta,mu ; LMULMAX1-NEXT: vle64.v v25, (a0) ; LMULMAX1-NEXT: addi a2, a0, 32 @@ -548,39 +550,41 @@ ; LMULMAX1-NEXT: vfncvt.f.xu.w v29, v27 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v27, v29 -; LMULMAX1-NEXT: addi a0, sp, 12 +; LMULMAX1-NEXT: addi a0, sp, 28 ; LMULMAX1-NEXT: vse16.v v27, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 4 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 8 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 8 -; LMULMAX1-NEXT: vle16.v v26, (a0) ; LMULMAX1-NEXT: addi a0, sp, 24 +; LMULMAX1-NEXT: vle16.v v26, (a0) +; LMULMAX1-NEXT: addi a0, sp, 40 ; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32,mf2,ta,mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16,mf4,ta,mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 -; LMULMAX1-NEXT: vse16.v v25, (sp) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16,mf2,ta,mu -; LMULMAX1-NEXT: vle16.v v25, (sp) ; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vle16.v v25, (a0) +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: addi a0, sp, 32 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 +; LMULMAX1-NEXT: addi sp, sp, 48 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = uitofp <8 x i64> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-fp.ll @@ -171,13 +171,13 @@ define <8 x half> @select_v8f16(i1 zeroext %c, <8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: select_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: bnez a0, .LBB4_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v9 -; CHECK-NEXT: fsh ft0, 0(sp) +; CHECK-NEXT: fsh ft0, 16(sp) ; CHECK-NEXT: beqz a0, .LBB4_4 ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: vsetivli zero, 1, e16,m1,ta,mu @@ -186,14 +186,14 @@ ; CHECK-NEXT: .LBB4_3: ; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fsh ft0, 
0(sp) +; CHECK-NEXT: fsh ft0, 16(sp) ; CHECK-NEXT: bnez a0, .LBB4_2 ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; CHECK-NEXT: vslidedown.vi v25, v9, 7 ; CHECK-NEXT: .LBB4_5: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 14(sp) +; CHECK-NEXT: fsh ft0, 30(sp) ; CHECK-NEXT: bnez a0, .LBB4_7 ; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: vslidedown.vi v25, v9, 6 @@ -202,7 +202,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 6 ; CHECK-NEXT: .LBB4_8: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 12(sp) +; CHECK-NEXT: fsh ft0, 28(sp) ; CHECK-NEXT: bnez a0, .LBB4_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: vslidedown.vi v25, v9, 5 @@ -211,7 +211,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 5 ; CHECK-NEXT: .LBB4_11: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 10(sp) +; CHECK-NEXT: fsh ft0, 26(sp) ; CHECK-NEXT: bnez a0, .LBB4_13 ; CHECK-NEXT: # %bb.12: ; CHECK-NEXT: vslidedown.vi v25, v9, 4 @@ -220,7 +220,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 4 ; CHECK-NEXT: .LBB4_14: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 8(sp) +; CHECK-NEXT: fsh ft0, 24(sp) ; CHECK-NEXT: bnez a0, .LBB4_16 ; CHECK-NEXT: # %bb.15: ; CHECK-NEXT: vslidedown.vi v25, v9, 3 @@ -229,7 +229,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 3 ; CHECK-NEXT: .LBB4_17: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 6(sp) +; CHECK-NEXT: fsh ft0, 22(sp) ; CHECK-NEXT: bnez a0, .LBB4_19 ; CHECK-NEXT: # %bb.18: ; CHECK-NEXT: vslidedown.vi v25, v9, 2 @@ -238,7 +238,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 2 ; CHECK-NEXT: .LBB4_20: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 4(sp) +; CHECK-NEXT: fsh ft0, 20(sp) ; CHECK-NEXT: bnez a0, .LBB4_22 ; CHECK-NEXT: # %bb.21: ; CHECK-NEXT: vslidedown.vi v25, v9, 1 @@ -247,10 +247,11 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 1 ; CHECK-NEXT: .LBB4_23: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 2(sp) +; CHECK-NEXT: fsh ft0, 18(sp) ; CHECK-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; CHECK-NEXT: vle16.v v8, (sp) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: ret %v = select i1 %c, <8 x half> %a, <8 x half> %b ret <8 x half> %v @@ -259,14 +260,14 @@ define <8 x half> @selectcc_v8f16(half %a, half %b, <8 x half> %c, <8 x half> %d) { ; CHECK-LABEL: selectcc_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: feq.h a0, fa0, fa1 ; CHECK-NEXT: bnez a0, .LBB5_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v9 -; CHECK-NEXT: fsh ft0, 0(sp) +; CHECK-NEXT: fsh ft0, 16(sp) ; CHECK-NEXT: beqz a0, .LBB5_4 ; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: vsetivli zero, 1, e16,m1,ta,mu @@ -275,14 +276,14 @@ ; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fsh ft0, 0(sp) +; CHECK-NEXT: fsh ft0, 16(sp) ; CHECK-NEXT: bnez a0, .LBB5_2 ; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: vsetivli zero, 1, e16,m1,ta,mu ; CHECK-NEXT: vslidedown.vi v25, v9, 7 ; CHECK-NEXT: .LBB5_5: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 14(sp) +; CHECK-NEXT: fsh ft0, 30(sp) ; CHECK-NEXT: bnez a0, .LBB5_7 ; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: vslidedown.vi v25, v9, 6 @@ -291,7 +292,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 6 ; CHECK-NEXT: .LBB5_8: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 12(sp) +; CHECK-NEXT: fsh ft0, 28(sp) ; CHECK-NEXT: bnez a0, 
.LBB5_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: vslidedown.vi v25, v9, 5 @@ -300,7 +301,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 5 ; CHECK-NEXT: .LBB5_11: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 10(sp) +; CHECK-NEXT: fsh ft0, 26(sp) ; CHECK-NEXT: bnez a0, .LBB5_13 ; CHECK-NEXT: # %bb.12: ; CHECK-NEXT: vslidedown.vi v25, v9, 4 @@ -309,7 +310,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 4 ; CHECK-NEXT: .LBB5_14: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 8(sp) +; CHECK-NEXT: fsh ft0, 24(sp) ; CHECK-NEXT: bnez a0, .LBB5_16 ; CHECK-NEXT: # %bb.15: ; CHECK-NEXT: vslidedown.vi v25, v9, 3 @@ -318,7 +319,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 3 ; CHECK-NEXT: .LBB5_17: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 6(sp) +; CHECK-NEXT: fsh ft0, 22(sp) ; CHECK-NEXT: bnez a0, .LBB5_19 ; CHECK-NEXT: # %bb.18: ; CHECK-NEXT: vslidedown.vi v25, v9, 2 @@ -327,7 +328,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 2 ; CHECK-NEXT: .LBB5_20: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 4(sp) +; CHECK-NEXT: fsh ft0, 20(sp) ; CHECK-NEXT: bnez a0, .LBB5_22 ; CHECK-NEXT: # %bb.21: ; CHECK-NEXT: vslidedown.vi v25, v9, 1 @@ -336,10 +337,11 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 1 ; CHECK-NEXT: .LBB5_23: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsh ft0, 2(sp) +; CHECK-NEXT: fsh ft0, 18(sp) ; CHECK-NEXT: vsetivli zero, 8, e16,m1,ta,mu -; CHECK-NEXT: vle16.v v8, (sp) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: ret %cmp = fcmp oeq half %a, %b %v = select i1 %cmp, <8 x half> %c, <8 x half> %d @@ -349,20 +351,20 @@ define <16 x half> @select_v16f16(i1 zeroext %c, <16 x half> %a, <16 x half> %b) { ; RV32-LABEL: select_v16f16: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -32 ; RV32-NEXT: bnez a0, .LBB6_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; RV32-NEXT: fsh ft0, 0(sp) +; RV32-NEXT: fsh ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB6_4 ; RV32-NEXT: .LBB6_2: ; RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu @@ -371,14 +373,14 @@ ; RV32-NEXT: .LBB6_3: ; RV32-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsh ft0, 0(sp) +; RV32-NEXT: fsh ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB6_2 ; RV32-NEXT: .LBB6_4: ; RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 15 ; RV32-NEXT: .LBB6_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 30(sp) +; RV32-NEXT: fsh ft0, 62(sp) ; RV32-NEXT: bnez a0, .LBB6_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 14 @@ -387,7 +389,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 14 ; RV32-NEXT: .LBB6_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 28(sp) +; RV32-NEXT: fsh ft0, 60(sp) ; RV32-NEXT: bnez a0, .LBB6_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 13 @@ -396,7 +398,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 13 ; RV32-NEXT: .LBB6_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 26(sp) +; RV32-NEXT: fsh ft0, 58(sp) ; 
RV32-NEXT: bnez a0, .LBB6_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v26, v10, 12 @@ -405,7 +407,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 12 ; RV32-NEXT: .LBB6_14: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 24(sp) +; RV32-NEXT: fsh ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB6_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v26, v10, 11 @@ -414,7 +416,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 11 ; RV32-NEXT: .LBB6_17: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 22(sp) +; RV32-NEXT: fsh ft0, 54(sp) ; RV32-NEXT: bnez a0, .LBB6_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v26, v10, 10 @@ -423,7 +425,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 10 ; RV32-NEXT: .LBB6_20: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 20(sp) +; RV32-NEXT: fsh ft0, 52(sp) ; RV32-NEXT: bnez a0, .LBB6_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v26, v10, 9 @@ -432,7 +434,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 9 ; RV32-NEXT: .LBB6_23: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 18(sp) +; RV32-NEXT: fsh ft0, 50(sp) ; RV32-NEXT: bnez a0, .LBB6_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v26, v10, 8 @@ -441,7 +443,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 8 ; RV32-NEXT: .LBB6_26: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 16(sp) +; RV32-NEXT: fsh ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB6_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v26, v10, 7 @@ -450,7 +452,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 7 ; RV32-NEXT: .LBB6_29: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 14(sp) +; RV32-NEXT: fsh ft0, 46(sp) ; RV32-NEXT: bnez a0, .LBB6_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v26, v10, 6 @@ -459,7 +461,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 6 ; RV32-NEXT: .LBB6_32: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 12(sp) +; RV32-NEXT: fsh ft0, 44(sp) ; RV32-NEXT: bnez a0, .LBB6_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v26, v10, 5 @@ -468,7 +470,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 5 ; RV32-NEXT: .LBB6_35: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 10(sp) +; RV32-NEXT: fsh ft0, 42(sp) ; RV32-NEXT: bnez a0, .LBB6_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v26, v10, 4 @@ -477,7 +479,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 4 ; RV32-NEXT: .LBB6_38: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 8(sp) +; RV32-NEXT: fsh ft0, 40(sp) ; RV32-NEXT: bnez a0, .LBB6_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v26, v10, 3 @@ -486,7 +488,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 3 ; RV32-NEXT: .LBB6_41: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 6(sp) +; RV32-NEXT: fsh ft0, 38(sp) ; RV32-NEXT: bnez a0, .LBB6_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -495,7 +497,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB6_44: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 4(sp) +; RV32-NEXT: fsh ft0, 36(sp) ; RV32-NEXT: bnez a0, .LBB6_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -504,31 +506,32 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 1 ; RV32-NEXT: .LBB6_47: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 2(sp) +; RV32-NEXT: fsh ft0, 34(sp) ; RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; RV32-NEXT: vle16.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte 
Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v16f16: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: bnez a0, .LBB6_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v10 -; RV64-NEXT: fsh ft0, 0(sp) +; RV64-NEXT: fsh ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB6_4 ; RV64-NEXT: .LBB6_2: ; RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu @@ -537,14 +540,14 @@ ; RV64-NEXT: .LBB6_3: ; RV64-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsh ft0, 0(sp) +; RV64-NEXT: fsh ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB6_2 ; RV64-NEXT: .LBB6_4: ; RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 15 ; RV64-NEXT: .LBB6_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 30(sp) +; RV64-NEXT: fsh ft0, 62(sp) ; RV64-NEXT: bnez a0, .LBB6_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 14 @@ -553,7 +556,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 14 ; RV64-NEXT: .LBB6_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 28(sp) +; RV64-NEXT: fsh ft0, 60(sp) ; RV64-NEXT: bnez a0, .LBB6_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 13 @@ -562,7 +565,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 13 ; RV64-NEXT: .LBB6_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 26(sp) +; RV64-NEXT: fsh ft0, 58(sp) ; RV64-NEXT: bnez a0, .LBB6_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v26, v10, 12 @@ -571,7 +574,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 12 ; RV64-NEXT: .LBB6_14: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 24(sp) +; RV64-NEXT: fsh ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB6_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v26, v10, 11 @@ -580,7 +583,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 11 ; RV64-NEXT: .LBB6_17: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 22(sp) +; RV64-NEXT: fsh ft0, 54(sp) ; RV64-NEXT: bnez a0, .LBB6_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v26, v10, 10 @@ -589,7 +592,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 10 ; RV64-NEXT: .LBB6_20: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 20(sp) +; RV64-NEXT: fsh ft0, 52(sp) ; RV64-NEXT: bnez a0, .LBB6_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v26, v10, 9 @@ -598,7 +601,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 9 ; RV64-NEXT: .LBB6_23: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 18(sp) +; RV64-NEXT: fsh ft0, 50(sp) ; RV64-NEXT: bnez a0, .LBB6_25 ; RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v26, v10, 8 @@ -607,7 +610,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 8 ; RV64-NEXT: .LBB6_26: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 16(sp) +; RV64-NEXT: fsh ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB6_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v26, v10, 7 @@ -616,7 +619,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 7 ; RV64-NEXT: .LBB6_29: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 14(sp) +; RV64-NEXT: fsh ft0, 46(sp) ; RV64-NEXT: bnez a0, .LBB6_31 ; 
RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v26, v10, 6 @@ -625,7 +628,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 6 ; RV64-NEXT: .LBB6_32: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 12(sp) +; RV64-NEXT: fsh ft0, 44(sp) ; RV64-NEXT: bnez a0, .LBB6_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v26, v10, 5 @@ -634,7 +637,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 5 ; RV64-NEXT: .LBB6_35: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 10(sp) +; RV64-NEXT: fsh ft0, 42(sp) ; RV64-NEXT: bnez a0, .LBB6_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v26, v10, 4 @@ -643,7 +646,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 4 ; RV64-NEXT: .LBB6_38: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 8(sp) +; RV64-NEXT: fsh ft0, 40(sp) ; RV64-NEXT: bnez a0, .LBB6_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v26, v10, 3 @@ -652,7 +655,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 3 ; RV64-NEXT: .LBB6_41: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 6(sp) +; RV64-NEXT: fsh ft0, 38(sp) ; RV64-NEXT: bnez a0, .LBB6_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -661,7 +664,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB6_44: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 4(sp) +; RV64-NEXT: fsh ft0, 36(sp) ; RV64-NEXT: bnez a0, .LBB6_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -670,13 +673,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB6_47: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 2(sp) +; RV64-NEXT: fsh ft0, 34(sp) ; RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; RV64-NEXT: vle16.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %v = select i1 %c, <16 x half> %a, <16 x half> %b ret <16 x half> %v @@ -685,13 +689,13 @@ define <16 x half> @selectcc_v16f16(half %a, half %b, <16 x half> %c, <16 x half> %d) { ; RV32-LABEL: selectcc_v16f16: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -32 ; RV32-NEXT: feq.h a0, fa0, fa1 @@ -699,7 +703,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; RV32-NEXT: fsh ft0, 0(sp) +; RV32-NEXT: fsh ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB7_4 ; RV32-NEXT: .LBB7_2: ; RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu @@ -708,14 +712,14 @@ ; RV32-NEXT: .LBB7_3: ; RV32-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsh ft0, 0(sp) +; RV32-NEXT: fsh ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB7_2 ; RV32-NEXT: .LBB7_4: ; RV32-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 15 ; RV32-NEXT: .LBB7_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 30(sp) +; RV32-NEXT: fsh ft0, 62(sp) ; RV32-NEXT: bnez a0, .LBB7_7 ; 
RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 14 @@ -724,7 +728,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 14 ; RV32-NEXT: .LBB7_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 28(sp) +; RV32-NEXT: fsh ft0, 60(sp) ; RV32-NEXT: bnez a0, .LBB7_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 13 @@ -733,7 +737,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 13 ; RV32-NEXT: .LBB7_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 26(sp) +; RV32-NEXT: fsh ft0, 58(sp) ; RV32-NEXT: bnez a0, .LBB7_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v26, v10, 12 @@ -742,7 +746,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 12 ; RV32-NEXT: .LBB7_14: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 24(sp) +; RV32-NEXT: fsh ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB7_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v26, v10, 11 @@ -751,7 +755,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 11 ; RV32-NEXT: .LBB7_17: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 22(sp) +; RV32-NEXT: fsh ft0, 54(sp) ; RV32-NEXT: bnez a0, .LBB7_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v26, v10, 10 @@ -760,7 +764,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 10 ; RV32-NEXT: .LBB7_20: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 20(sp) +; RV32-NEXT: fsh ft0, 52(sp) ; RV32-NEXT: bnez a0, .LBB7_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v26, v10, 9 @@ -769,7 +773,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 9 ; RV32-NEXT: .LBB7_23: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 18(sp) +; RV32-NEXT: fsh ft0, 50(sp) ; RV32-NEXT: bnez a0, .LBB7_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v26, v10, 8 @@ -778,7 +782,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 8 ; RV32-NEXT: .LBB7_26: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 16(sp) +; RV32-NEXT: fsh ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB7_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v26, v10, 7 @@ -787,7 +791,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 7 ; RV32-NEXT: .LBB7_29: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 14(sp) +; RV32-NEXT: fsh ft0, 46(sp) ; RV32-NEXT: bnez a0, .LBB7_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v26, v10, 6 @@ -796,7 +800,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 6 ; RV32-NEXT: .LBB7_32: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 12(sp) +; RV32-NEXT: fsh ft0, 44(sp) ; RV32-NEXT: bnez a0, .LBB7_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v26, v10, 5 @@ -805,7 +809,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 5 ; RV32-NEXT: .LBB7_35: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 10(sp) +; RV32-NEXT: fsh ft0, 42(sp) ; RV32-NEXT: bnez a0, .LBB7_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v26, v10, 4 @@ -814,7 +818,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 4 ; RV32-NEXT: .LBB7_38: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 8(sp) +; RV32-NEXT: fsh ft0, 40(sp) ; RV32-NEXT: bnez a0, .LBB7_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v26, v10, 3 @@ -823,7 +827,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 3 ; RV32-NEXT: .LBB7_41: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 6(sp) +; RV32-NEXT: fsh ft0, 38(sp) ; RV32-NEXT: bnez a0, .LBB7_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -832,7 +836,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB7_44: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 4(sp) +; RV32-NEXT: fsh ft0, 36(sp) ; RV32-NEXT: bnez a0, .LBB7_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -841,24 +845,25 @@ ; RV32-NEXT: 
vslidedown.vi v26, v8, 1 ; RV32-NEXT: .LBB7_47: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsh ft0, 2(sp) +; RV32-NEXT: fsh ft0, 34(sp) ; RV32-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; RV32-NEXT: vle16.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v16f16: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: feq.h a0, fa0, fa1 @@ -866,7 +871,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v10 -; RV64-NEXT: fsh ft0, 0(sp) +; RV64-NEXT: fsh ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB7_4 ; RV64-NEXT: .LBB7_2: ; RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu @@ -875,14 +880,14 @@ ; RV64-NEXT: .LBB7_3: ; RV64-NEXT: vsetvli zero, zero, e16,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsh ft0, 0(sp) +; RV64-NEXT: fsh ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB7_2 ; RV64-NEXT: .LBB7_4: ; RV64-NEXT: vsetivli zero, 1, e16,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 15 ; RV64-NEXT: .LBB7_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 30(sp) +; RV64-NEXT: fsh ft0, 62(sp) ; RV64-NEXT: bnez a0, .LBB7_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 14 @@ -891,7 +896,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 14 ; RV64-NEXT: .LBB7_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 28(sp) +; RV64-NEXT: fsh ft0, 60(sp) ; RV64-NEXT: bnez a0, .LBB7_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 13 @@ -900,7 +905,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 13 ; RV64-NEXT: .LBB7_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 26(sp) +; RV64-NEXT: fsh ft0, 58(sp) ; RV64-NEXT: bnez a0, .LBB7_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v26, v10, 12 @@ -909,7 +914,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 12 ; RV64-NEXT: .LBB7_14: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 24(sp) +; RV64-NEXT: fsh ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB7_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v26, v10, 11 @@ -918,7 +923,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 11 ; RV64-NEXT: .LBB7_17: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 22(sp) +; RV64-NEXT: fsh ft0, 54(sp) ; RV64-NEXT: bnez a0, .LBB7_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v26, v10, 10 @@ -927,7 +932,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 10 ; RV64-NEXT: .LBB7_20: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 20(sp) +; RV64-NEXT: fsh ft0, 52(sp) ; RV64-NEXT: bnez a0, .LBB7_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v26, v10, 9 @@ -936,7 +941,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 9 ; RV64-NEXT: .LBB7_23: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 18(sp) +; RV64-NEXT: fsh ft0, 50(sp) ; RV64-NEXT: bnez a0, .LBB7_25 ; 
RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v26, v10, 8 @@ -945,7 +950,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 8 ; RV64-NEXT: .LBB7_26: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 16(sp) +; RV64-NEXT: fsh ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB7_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v26, v10, 7 @@ -954,7 +959,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 7 ; RV64-NEXT: .LBB7_29: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 14(sp) +; RV64-NEXT: fsh ft0, 46(sp) ; RV64-NEXT: bnez a0, .LBB7_31 ; RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v26, v10, 6 @@ -963,7 +968,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 6 ; RV64-NEXT: .LBB7_32: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 12(sp) +; RV64-NEXT: fsh ft0, 44(sp) ; RV64-NEXT: bnez a0, .LBB7_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v26, v10, 5 @@ -972,7 +977,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 5 ; RV64-NEXT: .LBB7_35: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 10(sp) +; RV64-NEXT: fsh ft0, 42(sp) ; RV64-NEXT: bnez a0, .LBB7_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v26, v10, 4 @@ -981,7 +986,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 4 ; RV64-NEXT: .LBB7_38: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 8(sp) +; RV64-NEXT: fsh ft0, 40(sp) ; RV64-NEXT: bnez a0, .LBB7_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v26, v10, 3 @@ -990,7 +995,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 3 ; RV64-NEXT: .LBB7_41: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 6(sp) +; RV64-NEXT: fsh ft0, 38(sp) ; RV64-NEXT: bnez a0, .LBB7_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -999,7 +1004,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB7_44: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 4(sp) +; RV64-NEXT: fsh ft0, 36(sp) ; RV64-NEXT: bnez a0, .LBB7_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -1008,13 +1013,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB7_47: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsh ft0, 2(sp) +; RV64-NEXT: fsh ft0, 34(sp) ; RV64-NEXT: vsetivli zero, 16, e16,m2,ta,mu -; RV64-NEXT: vle16.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %cmp = fcmp oeq half %a, %b %v = select i1 %cmp, <16 x half> %c, <16 x half> %d @@ -1080,13 +1086,13 @@ define <4 x float> @select_v4f32(i1 zeroext %c, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: bnez a0, .LBB10_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vsetvli zero, zero, e32,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v9 -; CHECK-NEXT: fsw ft0, 0(sp) +; CHECK-NEXT: fsw ft0, 16(sp) ; CHECK-NEXT: beqz a0, .LBB10_4 ; CHECK-NEXT: .LBB10_2: ; CHECK-NEXT: vsetivli zero, 1, e32,m1,ta,mu @@ -1095,14 +1101,14 @@ ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: vsetvli zero, zero, e32,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fsw ft0, 0(sp) +; CHECK-NEXT: fsw ft0, 16(sp) ; CHECK-NEXT: bnez a0, .LBB10_2 ; CHECK-NEXT: .LBB10_4: ; CHECK-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; CHECK-NEXT: vslidedown.vi 
v25, v9, 3 ; CHECK-NEXT: .LBB10_5: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 12(sp) +; CHECK-NEXT: fsw ft0, 28(sp) ; CHECK-NEXT: bnez a0, .LBB10_7 ; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: vslidedown.vi v25, v9, 2 @@ -1111,7 +1117,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 2 ; CHECK-NEXT: .LBB10_8: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 8(sp) +; CHECK-NEXT: fsw ft0, 24(sp) ; CHECK-NEXT: bnez a0, .LBB10_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: vslidedown.vi v25, v9, 1 @@ -1120,10 +1126,11 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 1 ; CHECK-NEXT: .LBB10_11: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 4(sp) +; CHECK-NEXT: fsw ft0, 20(sp) ; CHECK-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; CHECK-NEXT: vle32.v v8, (sp) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: ret %v = select i1 %c, <4 x float> %a, <4 x float> %b ret <4 x float> %v @@ -1132,14 +1139,14 @@ define <4 x float> @selectcc_v4f32(float %a, float %b, <4 x float> %c, <4 x float> %d) { ; CHECK-LABEL: selectcc_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: feq.s a0, fa0, fa1 ; CHECK-NEXT: bnez a0, .LBB11_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vsetvli zero, zero, e32,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v9 -; CHECK-NEXT: fsw ft0, 0(sp) +; CHECK-NEXT: fsw ft0, 16(sp) ; CHECK-NEXT: beqz a0, .LBB11_4 ; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: vsetivli zero, 1, e32,m1,ta,mu @@ -1148,14 +1155,14 @@ ; CHECK-NEXT: .LBB11_3: ; CHECK-NEXT: vsetvli zero, zero, e32,m1,ta,mu ; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fsw ft0, 0(sp) +; CHECK-NEXT: fsw ft0, 16(sp) ; CHECK-NEXT: bnez a0, .LBB11_2 ; CHECK-NEXT: .LBB11_4: ; CHECK-NEXT: vsetivli zero, 1, e32,m1,ta,mu ; CHECK-NEXT: vslidedown.vi v25, v9, 3 ; CHECK-NEXT: .LBB11_5: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 12(sp) +; CHECK-NEXT: fsw ft0, 28(sp) ; CHECK-NEXT: bnez a0, .LBB11_7 ; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: vslidedown.vi v25, v9, 2 @@ -1164,7 +1171,7 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 2 ; CHECK-NEXT: .LBB11_8: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 8(sp) +; CHECK-NEXT: fsw ft0, 24(sp) ; CHECK-NEXT: bnez a0, .LBB11_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: vslidedown.vi v25, v9, 1 @@ -1173,10 +1180,11 @@ ; CHECK-NEXT: vslidedown.vi v25, v8, 1 ; CHECK-NEXT: .LBB11_11: ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: fsw ft0, 4(sp) +; CHECK-NEXT: fsw ft0, 20(sp) ; CHECK-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; CHECK-NEXT: vle32.v v8, (sp) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: ret %cmp = fcmp oeq float %a, %b %v = select i1 %cmp, <4 x float> %c, <4 x float> %d @@ -1186,20 +1194,20 @@ define <8 x float> @select_v8f32(i1 zeroext %c, <8 x float> %a, <8 x float> %b) { ; RV32-LABEL: select_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, 
sp, -32 ; RV32-NEXT: bnez a0, .LBB12_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB12_4 ; RV32-NEXT: .LBB12_2: ; RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu @@ -1208,14 +1216,14 @@ ; RV32-NEXT: .LBB12_3: ; RV32-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB12_2 ; RV32-NEXT: .LBB12_4: ; RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 7 ; RV32-NEXT: .LBB12_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 28(sp) +; RV32-NEXT: fsw ft0, 60(sp) ; RV32-NEXT: bnez a0, .LBB12_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 6 @@ -1224,7 +1232,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 6 ; RV32-NEXT: .LBB12_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 24(sp) +; RV32-NEXT: fsw ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB12_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 5 @@ -1233,7 +1241,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 5 ; RV32-NEXT: .LBB12_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 20(sp) +; RV32-NEXT: fsw ft0, 52(sp) ; RV32-NEXT: bnez a0, .LBB12_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v26, v10, 4 @@ -1242,7 +1250,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 4 ; RV32-NEXT: .LBB12_14: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 16(sp) +; RV32-NEXT: fsw ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB12_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v26, v10, 3 @@ -1251,7 +1259,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 3 ; RV32-NEXT: .LBB12_17: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 12(sp) +; RV32-NEXT: fsw ft0, 44(sp) ; RV32-NEXT: bnez a0, .LBB12_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -1260,7 +1268,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB12_20: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 8(sp) +; RV32-NEXT: fsw ft0, 40(sp) ; RV32-NEXT: bnez a0, .LBB12_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -1269,31 +1277,32 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 1 ; RV32-NEXT: .LBB12_23: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 4(sp) +; RV32-NEXT: fsw ft0, 36(sp) ; RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: bnez a0, .LBB12_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v10 -; RV64-NEXT: fsw ft0, 0(sp) +; 
RV64-NEXT: fsw ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB12_4 ; RV64-NEXT: .LBB12_2: ; RV64-NEXT: vsetivli zero, 1, e32,m2,ta,mu @@ -1302,14 +1311,14 @@ ; RV64-NEXT: .LBB12_3: ; RV64-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB12_2 ; RV64-NEXT: .LBB12_4: ; RV64-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 7 ; RV64-NEXT: .LBB12_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 28(sp) +; RV64-NEXT: fsw ft0, 60(sp) ; RV64-NEXT: bnez a0, .LBB12_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 6 @@ -1318,7 +1327,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 6 ; RV64-NEXT: .LBB12_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 24(sp) +; RV64-NEXT: fsw ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB12_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 5 @@ -1327,7 +1336,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 5 ; RV64-NEXT: .LBB12_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 20(sp) +; RV64-NEXT: fsw ft0, 52(sp) ; RV64-NEXT: bnez a0, .LBB12_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v26, v10, 4 @@ -1336,7 +1345,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 4 ; RV64-NEXT: .LBB12_14: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 16(sp) +; RV64-NEXT: fsw ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB12_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v26, v10, 3 @@ -1345,7 +1354,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 3 ; RV64-NEXT: .LBB12_17: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 12(sp) +; RV64-NEXT: fsw ft0, 44(sp) ; RV64-NEXT: bnez a0, .LBB12_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -1354,7 +1363,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB12_20: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 8(sp) +; RV64-NEXT: fsw ft0, 40(sp) ; RV64-NEXT: bnez a0, .LBB12_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -1363,13 +1372,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB12_23: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 4(sp) +; RV64-NEXT: fsw ft0, 36(sp) ; RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %v = select i1 %c, <8 x float> %a, <8 x float> %b ret <8 x float> %v @@ -1378,13 +1388,13 @@ define <8 x float> @selectcc_v8f32(float %a, float %b, <8 x float> %c, <8 x float> %d) { ; RV32-LABEL: selectcc_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -32 ; RV32-NEXT: feq.s a0, fa0, fa1 @@ -1392,7 +1402,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; 
RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB13_4 ; RV32-NEXT: .LBB13_2: ; RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu @@ -1401,14 +1411,14 @@ ; RV32-NEXT: .LBB13_3: ; RV32-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB13_2 ; RV32-NEXT: .LBB13_4: ; RV32-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 7 ; RV32-NEXT: .LBB13_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 28(sp) +; RV32-NEXT: fsw ft0, 60(sp) ; RV32-NEXT: bnez a0, .LBB13_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 6 @@ -1417,7 +1427,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 6 ; RV32-NEXT: .LBB13_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 24(sp) +; RV32-NEXT: fsw ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB13_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 5 @@ -1426,7 +1436,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 5 ; RV32-NEXT: .LBB13_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 20(sp) +; RV32-NEXT: fsw ft0, 52(sp) ; RV32-NEXT: bnez a0, .LBB13_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v26, v10, 4 @@ -1435,7 +1445,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 4 ; RV32-NEXT: .LBB13_14: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 16(sp) +; RV32-NEXT: fsw ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB13_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v26, v10, 3 @@ -1444,7 +1454,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 3 ; RV32-NEXT: .LBB13_17: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 12(sp) +; RV32-NEXT: fsw ft0, 44(sp) ; RV32-NEXT: bnez a0, .LBB13_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -1453,7 +1463,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB13_20: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 8(sp) +; RV32-NEXT: fsw ft0, 40(sp) ; RV32-NEXT: bnez a0, .LBB13_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -1462,24 +1472,25 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 1 ; RV32-NEXT: .LBB13_23: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsw ft0, 4(sp) +; RV32-NEXT: fsw ft0, 36(sp) ; RV32-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: feq.s a0, fa0, fa1 @@ -1487,7 +1498,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v10 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB13_4 ; RV64-NEXT: .LBB13_2: ; RV64-NEXT: vsetivli zero, 1, 
e32,m2,ta,mu @@ -1496,14 +1507,14 @@ ; RV64-NEXT: .LBB13_3: ; RV64-NEXT: vsetvli zero, zero, e32,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB13_2 ; RV64-NEXT: .LBB13_4: ; RV64-NEXT: vsetivli zero, 1, e32,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 7 ; RV64-NEXT: .LBB13_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 28(sp) +; RV64-NEXT: fsw ft0, 60(sp) ; RV64-NEXT: bnez a0, .LBB13_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 6 @@ -1512,7 +1523,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 6 ; RV64-NEXT: .LBB13_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 24(sp) +; RV64-NEXT: fsw ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB13_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 5 @@ -1521,7 +1532,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 5 ; RV64-NEXT: .LBB13_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 20(sp) +; RV64-NEXT: fsw ft0, 52(sp) ; RV64-NEXT: bnez a0, .LBB13_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v26, v10, 4 @@ -1530,7 +1541,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 4 ; RV64-NEXT: .LBB13_14: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 16(sp) +; RV64-NEXT: fsw ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB13_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v26, v10, 3 @@ -1539,7 +1550,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 3 ; RV64-NEXT: .LBB13_17: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 12(sp) +; RV64-NEXT: fsw ft0, 44(sp) ; RV64-NEXT: bnez a0, .LBB13_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -1548,7 +1559,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB13_20: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 8(sp) +; RV64-NEXT: fsw ft0, 40(sp) ; RV64-NEXT: bnez a0, .LBB13_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -1557,13 +1568,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB13_23: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsw ft0, 4(sp) +; RV64-NEXT: fsw ft0, 36(sp) ; RV64-NEXT: vsetivli zero, 8, e32,m2,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %cmp = fcmp oeq float %a, %b %v = select i1 %cmp, <8 x float> %c, <8 x float> %d @@ -1573,20 +1585,20 @@ define <16 x float> @select_v16f32(i1 zeroext %c, <16 x float> %a, <16 x float> %b) { ; RV32-LABEL: select_v16f32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -192 +; RV32-NEXT: .cfi_def_cfa_offset 192 +; RV32-NEXT: sw ra, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 184(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: addi s0, sp, 192 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: bnez a0, .LBB14_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v12 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 64(sp) ; RV32-NEXT: beqz a0, .LBB14_4 ; RV32-NEXT: .LBB14_2: ; 
RV32-NEXT: vsetivli zero, 1, e32,m4,ta,mu @@ -1595,14 +1607,14 @@ ; RV32-NEXT: .LBB14_3: ; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 64(sp) ; RV32-NEXT: bnez a0, .LBB14_2 ; RV32-NEXT: .LBB14_4: ; RV32-NEXT: vsetivli zero, 1, e32,m4,ta,mu ; RV32-NEXT: vslidedown.vi v28, v12, 15 ; RV32-NEXT: .LBB14_5: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 60(sp) +; RV32-NEXT: fsw ft0, 124(sp) ; RV32-NEXT: bnez a0, .LBB14_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v28, v12, 14 @@ -1611,7 +1623,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 14 ; RV32-NEXT: .LBB14_8: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 56(sp) +; RV32-NEXT: fsw ft0, 120(sp) ; RV32-NEXT: bnez a0, .LBB14_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v28, v12, 13 @@ -1620,7 +1632,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 13 ; RV32-NEXT: .LBB14_11: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 52(sp) +; RV32-NEXT: fsw ft0, 116(sp) ; RV32-NEXT: bnez a0, .LBB14_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v28, v12, 12 @@ -1629,7 +1641,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 12 ; RV32-NEXT: .LBB14_14: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 48(sp) +; RV32-NEXT: fsw ft0, 112(sp) ; RV32-NEXT: bnez a0, .LBB14_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v28, v12, 11 @@ -1638,7 +1650,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 11 ; RV32-NEXT: .LBB14_17: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 44(sp) +; RV32-NEXT: fsw ft0, 108(sp) ; RV32-NEXT: bnez a0, .LBB14_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v28, v12, 10 @@ -1647,7 +1659,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 10 ; RV32-NEXT: .LBB14_20: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 40(sp) +; RV32-NEXT: fsw ft0, 104(sp) ; RV32-NEXT: bnez a0, .LBB14_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v28, v12, 9 @@ -1656,7 +1668,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 9 ; RV32-NEXT: .LBB14_23: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 36(sp) +; RV32-NEXT: fsw ft0, 100(sp) ; RV32-NEXT: bnez a0, .LBB14_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v28, v12, 8 @@ -1665,7 +1677,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 8 ; RV32-NEXT: .LBB14_26: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 32(sp) +; RV32-NEXT: fsw ft0, 96(sp) ; RV32-NEXT: bnez a0, .LBB14_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v28, v12, 7 @@ -1674,7 +1686,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 7 ; RV32-NEXT: .LBB14_29: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 28(sp) +; RV32-NEXT: fsw ft0, 92(sp) ; RV32-NEXT: bnez a0, .LBB14_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v28, v12, 6 @@ -1683,7 +1695,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 6 ; RV32-NEXT: .LBB14_32: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 24(sp) +; RV32-NEXT: fsw ft0, 88(sp) ; RV32-NEXT: bnez a0, .LBB14_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v28, v12, 5 @@ -1692,7 +1704,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 5 ; RV32-NEXT: .LBB14_35: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 20(sp) +; RV32-NEXT: fsw ft0, 84(sp) ; RV32-NEXT: bnez a0, .LBB14_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v28, v12, 4 @@ -1701,7 +1713,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 4 ; RV32-NEXT: .LBB14_38: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 16(sp) +; RV32-NEXT: fsw ft0, 80(sp) ; RV32-NEXT: bnez a0, .LBB14_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v28, v12, 3 @@ -1710,7 
+1722,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 3 ; RV32-NEXT: .LBB14_41: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 12(sp) +; RV32-NEXT: fsw ft0, 76(sp) ; RV32-NEXT: bnez a0, .LBB14_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v28, v12, 2 @@ -1719,7 +1731,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 2 ; RV32-NEXT: .LBB14_44: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 8(sp) +; RV32-NEXT: fsw ft0, 72(sp) ; RV32-NEXT: bnez a0, .LBB14_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v28, v12, 1 @@ -1728,31 +1740,32 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 1 ; RV32-NEXT: .LBB14_47: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 4(sp) +; RV32-NEXT: fsw ft0, 68(sp) ; RV32-NEXT: vsetivli zero, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -192 +; RV32-NEXT: lw s0, 184(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 192 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v16f32: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: .cfi_def_cfa_offset 192 +; RV64-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 176(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: addi s0, sp, 192 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: bnez a0, .LBB14_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v12 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 64(sp) ; RV64-NEXT: beqz a0, .LBB14_4 ; RV64-NEXT: .LBB14_2: ; RV64-NEXT: vsetivli zero, 1, e32,m4,ta,mu @@ -1761,14 +1774,14 @@ ; RV64-NEXT: .LBB14_3: ; RV64-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 64(sp) ; RV64-NEXT: bnez a0, .LBB14_2 ; RV64-NEXT: .LBB14_4: ; RV64-NEXT: vsetivli zero, 1, e32,m4,ta,mu ; RV64-NEXT: vslidedown.vi v28, v12, 15 ; RV64-NEXT: .LBB14_5: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 60(sp) +; RV64-NEXT: fsw ft0, 124(sp) ; RV64-NEXT: bnez a0, .LBB14_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v28, v12, 14 @@ -1777,7 +1790,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 14 ; RV64-NEXT: .LBB14_8: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 56(sp) +; RV64-NEXT: fsw ft0, 120(sp) ; RV64-NEXT: bnez a0, .LBB14_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v28, v12, 13 @@ -1786,7 +1799,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 13 ; RV64-NEXT: .LBB14_11: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 52(sp) +; RV64-NEXT: fsw ft0, 116(sp) ; RV64-NEXT: bnez a0, .LBB14_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v28, v12, 12 @@ -1795,7 +1808,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 12 ; RV64-NEXT: .LBB14_14: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 48(sp) +; RV64-NEXT: fsw ft0, 112(sp) ; RV64-NEXT: bnez a0, .LBB14_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v28, v12, 11 @@ -1804,7 +1817,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 11 ; RV64-NEXT: .LBB14_17: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 44(sp) +; 
RV64-NEXT: fsw ft0, 108(sp) ; RV64-NEXT: bnez a0, .LBB14_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v28, v12, 10 @@ -1813,7 +1826,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 10 ; RV64-NEXT: .LBB14_20: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 40(sp) +; RV64-NEXT: fsw ft0, 104(sp) ; RV64-NEXT: bnez a0, .LBB14_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v28, v12, 9 @@ -1822,7 +1835,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 9 ; RV64-NEXT: .LBB14_23: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 36(sp) +; RV64-NEXT: fsw ft0, 100(sp) ; RV64-NEXT: bnez a0, .LBB14_25 ; RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v28, v12, 8 @@ -1831,7 +1844,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 8 ; RV64-NEXT: .LBB14_26: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 32(sp) +; RV64-NEXT: fsw ft0, 96(sp) ; RV64-NEXT: bnez a0, .LBB14_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v28, v12, 7 @@ -1840,7 +1853,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 7 ; RV64-NEXT: .LBB14_29: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 28(sp) +; RV64-NEXT: fsw ft0, 92(sp) ; RV64-NEXT: bnez a0, .LBB14_31 ; RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v28, v12, 6 @@ -1849,7 +1862,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 6 ; RV64-NEXT: .LBB14_32: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 24(sp) +; RV64-NEXT: fsw ft0, 88(sp) ; RV64-NEXT: bnez a0, .LBB14_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v28, v12, 5 @@ -1858,7 +1871,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 5 ; RV64-NEXT: .LBB14_35: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 20(sp) +; RV64-NEXT: fsw ft0, 84(sp) ; RV64-NEXT: bnez a0, .LBB14_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v28, v12, 4 @@ -1867,7 +1880,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 4 ; RV64-NEXT: .LBB14_38: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 16(sp) +; RV64-NEXT: fsw ft0, 80(sp) ; RV64-NEXT: bnez a0, .LBB14_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v28, v12, 3 @@ -1876,7 +1889,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 3 ; RV64-NEXT: .LBB14_41: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 12(sp) +; RV64-NEXT: fsw ft0, 76(sp) ; RV64-NEXT: bnez a0, .LBB14_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v28, v12, 2 @@ -1885,7 +1898,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 2 ; RV64-NEXT: .LBB14_44: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 8(sp) +; RV64-NEXT: fsw ft0, 72(sp) ; RV64-NEXT: bnez a0, .LBB14_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v28, v12, 1 @@ -1894,13 +1907,14 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 1 ; RV64-NEXT: .LBB14_47: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 4(sp) +; RV64-NEXT: fsw ft0, 68(sp) ; RV64-NEXT: vsetivli zero, 16, e32,m4,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -192 +; RV64-NEXT: ld s0, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 ; RV64-NEXT: ret %v = select i1 %c, <16 x float> %a, <16 x float> %b ret <16 x float> %v @@ -1909,13 +1923,13 @@ define <16 x float> @selectcc_v16f32(float %a, float %b, <16 x float> %c, <16 x float> %d) { ; RV32-LABEL: selectcc_v16f32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte 
Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -192 +; RV32-NEXT: .cfi_def_cfa_offset 192 +; RV32-NEXT: sw ra, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 184(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: addi s0, sp, 192 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: feq.s a0, fa0, fa1 @@ -1923,7 +1937,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v12 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 64(sp) ; RV32-NEXT: beqz a0, .LBB15_4 ; RV32-NEXT: .LBB15_2: ; RV32-NEXT: vsetivli zero, 1, e32,m4,ta,mu @@ -1932,14 +1946,14 @@ ; RV32-NEXT: .LBB15_3: ; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsw ft0, 0(sp) +; RV32-NEXT: fsw ft0, 64(sp) ; RV32-NEXT: bnez a0, .LBB15_2 ; RV32-NEXT: .LBB15_4: ; RV32-NEXT: vsetivli zero, 1, e32,m4,ta,mu ; RV32-NEXT: vslidedown.vi v28, v12, 15 ; RV32-NEXT: .LBB15_5: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 60(sp) +; RV32-NEXT: fsw ft0, 124(sp) ; RV32-NEXT: bnez a0, .LBB15_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v28, v12, 14 @@ -1948,7 +1962,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 14 ; RV32-NEXT: .LBB15_8: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 56(sp) +; RV32-NEXT: fsw ft0, 120(sp) ; RV32-NEXT: bnez a0, .LBB15_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v28, v12, 13 @@ -1957,7 +1971,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 13 ; RV32-NEXT: .LBB15_11: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 52(sp) +; RV32-NEXT: fsw ft0, 116(sp) ; RV32-NEXT: bnez a0, .LBB15_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v28, v12, 12 @@ -1966,7 +1980,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 12 ; RV32-NEXT: .LBB15_14: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 48(sp) +; RV32-NEXT: fsw ft0, 112(sp) ; RV32-NEXT: bnez a0, .LBB15_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v28, v12, 11 @@ -1975,7 +1989,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 11 ; RV32-NEXT: .LBB15_17: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 44(sp) +; RV32-NEXT: fsw ft0, 108(sp) ; RV32-NEXT: bnez a0, .LBB15_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v28, v12, 10 @@ -1984,7 +1998,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 10 ; RV32-NEXT: .LBB15_20: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 40(sp) +; RV32-NEXT: fsw ft0, 104(sp) ; RV32-NEXT: bnez a0, .LBB15_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v28, v12, 9 @@ -1993,7 +2007,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 9 ; RV32-NEXT: .LBB15_23: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 36(sp) +; RV32-NEXT: fsw ft0, 100(sp) ; RV32-NEXT: bnez a0, .LBB15_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v28, v12, 8 @@ -2002,7 +2016,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 8 ; RV32-NEXT: .LBB15_26: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 32(sp) +; RV32-NEXT: fsw ft0, 96(sp) ; RV32-NEXT: bnez a0, .LBB15_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v28, v12, 7 @@ -2011,7 +2025,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 7 ; RV32-NEXT: .LBB15_29: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 28(sp) +; RV32-NEXT: fsw ft0, 92(sp) ; RV32-NEXT: bnez a0, .LBB15_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v28, v12, 6 @@ -2020,7 +2034,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 6 ; RV32-NEXT: .LBB15_32: ; RV32-NEXT: vfmv.f.s ft0, v28 -; 
RV32-NEXT: fsw ft0, 24(sp) +; RV32-NEXT: fsw ft0, 88(sp) ; RV32-NEXT: bnez a0, .LBB15_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v28, v12, 5 @@ -2029,7 +2043,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 5 ; RV32-NEXT: .LBB15_35: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 20(sp) +; RV32-NEXT: fsw ft0, 84(sp) ; RV32-NEXT: bnez a0, .LBB15_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v28, v12, 4 @@ -2038,7 +2052,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 4 ; RV32-NEXT: .LBB15_38: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 16(sp) +; RV32-NEXT: fsw ft0, 80(sp) ; RV32-NEXT: bnez a0, .LBB15_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v28, v12, 3 @@ -2047,7 +2061,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 3 ; RV32-NEXT: .LBB15_41: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 12(sp) +; RV32-NEXT: fsw ft0, 76(sp) ; RV32-NEXT: bnez a0, .LBB15_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v28, v12, 2 @@ -2056,7 +2070,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 2 ; RV32-NEXT: .LBB15_44: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 8(sp) +; RV32-NEXT: fsw ft0, 72(sp) ; RV32-NEXT: bnez a0, .LBB15_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v28, v12, 1 @@ -2065,24 +2079,25 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 1 ; RV32-NEXT: .LBB15_47: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsw ft0, 4(sp) +; RV32-NEXT: fsw ft0, 68(sp) ; RV32-NEXT: vsetivli zero, 16, e32,m4,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -192 +; RV32-NEXT: lw s0, 184(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 192 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v16f32: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: .cfi_def_cfa_offset 192 +; RV64-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 176(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: addi s0, sp, 192 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: feq.s a0, fa0, fa1 @@ -2090,7 +2105,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v12 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 64(sp) ; RV64-NEXT: beqz a0, .LBB15_4 ; RV64-NEXT: .LBB15_2: ; RV64-NEXT: vsetivli zero, 1, e32,m4,ta,mu @@ -2099,14 +2114,14 @@ ; RV64-NEXT: .LBB15_3: ; RV64-NEXT: vsetvli zero, zero, e32,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsw ft0, 0(sp) +; RV64-NEXT: fsw ft0, 64(sp) ; RV64-NEXT: bnez a0, .LBB15_2 ; RV64-NEXT: .LBB15_4: ; RV64-NEXT: vsetivli zero, 1, e32,m4,ta,mu ; RV64-NEXT: vslidedown.vi v28, v12, 15 ; RV64-NEXT: .LBB15_5: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 60(sp) +; RV64-NEXT: fsw ft0, 124(sp) ; RV64-NEXT: bnez a0, .LBB15_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v28, v12, 14 @@ -2115,7 +2130,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 14 ; RV64-NEXT: .LBB15_8: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 56(sp) +; RV64-NEXT: fsw ft0, 120(sp) ; RV64-NEXT: bnez a0, .LBB15_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: 
vslidedown.vi v28, v12, 13 @@ -2124,7 +2139,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 13 ; RV64-NEXT: .LBB15_11: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 52(sp) +; RV64-NEXT: fsw ft0, 116(sp) ; RV64-NEXT: bnez a0, .LBB15_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v28, v12, 12 @@ -2133,7 +2148,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 12 ; RV64-NEXT: .LBB15_14: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 48(sp) +; RV64-NEXT: fsw ft0, 112(sp) ; RV64-NEXT: bnez a0, .LBB15_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v28, v12, 11 @@ -2142,7 +2157,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 11 ; RV64-NEXT: .LBB15_17: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 44(sp) +; RV64-NEXT: fsw ft0, 108(sp) ; RV64-NEXT: bnez a0, .LBB15_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v28, v12, 10 @@ -2151,7 +2166,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 10 ; RV64-NEXT: .LBB15_20: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 40(sp) +; RV64-NEXT: fsw ft0, 104(sp) ; RV64-NEXT: bnez a0, .LBB15_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v28, v12, 9 @@ -2160,7 +2175,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 9 ; RV64-NEXT: .LBB15_23: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 36(sp) +; RV64-NEXT: fsw ft0, 100(sp) ; RV64-NEXT: bnez a0, .LBB15_25 ; RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v28, v12, 8 @@ -2169,7 +2184,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 8 ; RV64-NEXT: .LBB15_26: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 32(sp) +; RV64-NEXT: fsw ft0, 96(sp) ; RV64-NEXT: bnez a0, .LBB15_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v28, v12, 7 @@ -2178,7 +2193,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 7 ; RV64-NEXT: .LBB15_29: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 28(sp) +; RV64-NEXT: fsw ft0, 92(sp) ; RV64-NEXT: bnez a0, .LBB15_31 ; RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v28, v12, 6 @@ -2187,7 +2202,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 6 ; RV64-NEXT: .LBB15_32: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 24(sp) +; RV64-NEXT: fsw ft0, 88(sp) ; RV64-NEXT: bnez a0, .LBB15_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v28, v12, 5 @@ -2196,7 +2211,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 5 ; RV64-NEXT: .LBB15_35: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 20(sp) +; RV64-NEXT: fsw ft0, 84(sp) ; RV64-NEXT: bnez a0, .LBB15_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v28, v12, 4 @@ -2205,7 +2220,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 4 ; RV64-NEXT: .LBB15_38: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 16(sp) +; RV64-NEXT: fsw ft0, 80(sp) ; RV64-NEXT: bnez a0, .LBB15_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v28, v12, 3 @@ -2214,7 +2229,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 3 ; RV64-NEXT: .LBB15_41: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 12(sp) +; RV64-NEXT: fsw ft0, 76(sp) ; RV64-NEXT: bnez a0, .LBB15_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v28, v12, 2 @@ -2223,7 +2238,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 2 ; RV64-NEXT: .LBB15_44: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 8(sp) +; RV64-NEXT: fsw ft0, 72(sp) ; RV64-NEXT: bnez a0, .LBB15_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v28, v12, 1 @@ -2232,13 +2247,14 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 1 ; RV64-NEXT: .LBB15_47: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsw ft0, 4(sp) +; RV64-NEXT: fsw ft0, 68(sp) ; RV64-NEXT: vsetivli zero, 16, e32,m4,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; 
RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -192 +; RV64-NEXT: ld s0, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 ; RV64-NEXT: ret %cmp = fcmp oeq float %a, %b %v = select i1 %cmp, <16 x float> %c, <16 x float> %d @@ -2304,20 +2320,20 @@ define <4 x double> @select_v4f64(i1 zeroext %c, <4 x double> %a, <4 x double> %b) { ; RV32-LABEL: select_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -32 ; RV32-NEXT: bnez a0, .LBB18_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB18_4 ; RV32-NEXT: .LBB18_2: ; RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu @@ -2326,14 +2342,14 @@ ; RV32-NEXT: .LBB18_3: ; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB18_2 ; RV32-NEXT: .LBB18_4: ; RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 3 ; RV32-NEXT: .LBB18_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB18_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -2342,7 +2358,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB18_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB18_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -2351,31 +2367,32 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 1 ; RV32-NEXT: .LBB18_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 40(sp) ; RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: bnez a0, .LBB18_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV64-NEXT: 
vfmv.f.s ft0, v10 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB18_4 ; RV64-NEXT: .LBB18_2: ; RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu @@ -2384,14 +2401,14 @@ ; RV64-NEXT: .LBB18_3: ; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB18_2 ; RV64-NEXT: .LBB18_4: ; RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 3 ; RV64-NEXT: .LBB18_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB18_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -2400,7 +2417,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB18_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB18_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -2409,13 +2426,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB18_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 40(sp) ; RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %v = select i1 %c, <4 x double> %a, <4 x double> %b ret <4 x double> %v @@ -2424,13 +2442,13 @@ define <4 x double> @selectcc_v4f64(double %a, double %b, <4 x double> %c, <4 x double> %d) { ; RV32-LABEL: selectcc_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -96 +; RV32-NEXT: .cfi_def_cfa_offset 96 +; RV32-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 64 +; RV32-NEXT: addi s0, sp, 96 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -32 ; RV32-NEXT: feq.d a0, fa0, fa1 @@ -2438,7 +2456,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v10 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 32(sp) ; RV32-NEXT: beqz a0, .LBB19_4 ; RV32-NEXT: .LBB19_2: ; RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu @@ -2447,14 +2465,14 @@ ; RV32-NEXT: .LBB19_3: ; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 32(sp) ; RV32-NEXT: bnez a0, .LBB19_2 ; RV32-NEXT: .LBB19_4: ; RV32-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; RV32-NEXT: vslidedown.vi v26, v10, 3 ; RV32-NEXT: .LBB19_5: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 56(sp) ; RV32-NEXT: bnez a0, .LBB19_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v26, v10, 2 @@ -2463,7 +2481,7 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 2 ; RV32-NEXT: .LBB19_8: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 48(sp) ; RV32-NEXT: bnez a0, .LBB19_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v26, v10, 1 @@ -2472,24 +2490,25 @@ ; RV32-NEXT: vslidedown.vi v26, v8, 1 ; 
RV32-NEXT: .LBB19_11: ; RV32-NEXT: vfmv.f.s ft0, v26 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 40(sp) ; RV32-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -64 -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -96 +; RV32-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 96 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -96 +; RV64-NEXT: .cfi_def_cfa_offset 96 +; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 64 +; RV64-NEXT: addi s0, sp, 96 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: feq.d a0, fa0, fa1 @@ -2497,7 +2516,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v10 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 32(sp) ; RV64-NEXT: beqz a0, .LBB19_4 ; RV64-NEXT: .LBB19_2: ; RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu @@ -2506,14 +2525,14 @@ ; RV64-NEXT: .LBB19_3: ; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 32(sp) ; RV64-NEXT: bnez a0, .LBB19_2 ; RV64-NEXT: .LBB19_4: ; RV64-NEXT: vsetivli zero, 1, e64,m2,ta,mu ; RV64-NEXT: vslidedown.vi v26, v10, 3 ; RV64-NEXT: .LBB19_5: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 56(sp) ; RV64-NEXT: bnez a0, .LBB19_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v26, v10, 2 @@ -2522,7 +2541,7 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 2 ; RV64-NEXT: .LBB19_8: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 48(sp) ; RV64-NEXT: bnez a0, .LBB19_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v26, v10, 1 @@ -2531,13 +2550,14 @@ ; RV64-NEXT: vslidedown.vi v26, v8, 1 ; RV64-NEXT: .LBB19_11: ; RV64-NEXT: vfmv.f.s ft0, v26 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 40(sp) ; RV64-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -64 -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: addi a0, sp, 32 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -96 +; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %cmp = fcmp oeq double %a, %b %v = select i1 %cmp, <4 x double> %c, <4 x double> %d @@ -2547,20 +2567,20 @@ define <8 x double> @select_v8f64(i1 zeroext %c, <8 x double> %a, <8 x double> %b) { ; RV32-LABEL: select_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -192 +; RV32-NEXT: .cfi_def_cfa_offset 192 +; RV32-NEXT: sw ra, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 184(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: 
.cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: addi s0, sp, 192 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: bnez a0, .LBB20_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v12 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 64(sp) ; RV32-NEXT: beqz a0, .LBB20_4 ; RV32-NEXT: .LBB20_2: ; RV32-NEXT: vsetivli zero, 1, e64,m4,ta,mu @@ -2569,14 +2589,14 @@ ; RV32-NEXT: .LBB20_3: ; RV32-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 64(sp) ; RV32-NEXT: bnez a0, .LBB20_2 ; RV32-NEXT: .LBB20_4: ; RV32-NEXT: vsetivli zero, 1, e64,m4,ta,mu ; RV32-NEXT: vslidedown.vi v28, v12, 7 ; RV32-NEXT: .LBB20_5: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 56(sp) +; RV32-NEXT: fsd ft0, 120(sp) ; RV32-NEXT: bnez a0, .LBB20_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v28, v12, 6 @@ -2585,7 +2605,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 6 ; RV32-NEXT: .LBB20_8: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 48(sp) +; RV32-NEXT: fsd ft0, 112(sp) ; RV32-NEXT: bnez a0, .LBB20_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v28, v12, 5 @@ -2594,7 +2614,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 5 ; RV32-NEXT: .LBB20_11: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 40(sp) +; RV32-NEXT: fsd ft0, 104(sp) ; RV32-NEXT: bnez a0, .LBB20_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v28, v12, 4 @@ -2603,7 +2623,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 4 ; RV32-NEXT: .LBB20_14: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 32(sp) +; RV32-NEXT: fsd ft0, 96(sp) ; RV32-NEXT: bnez a0, .LBB20_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v28, v12, 3 @@ -2612,7 +2632,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 3 ; RV32-NEXT: .LBB20_17: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 88(sp) ; RV32-NEXT: bnez a0, .LBB20_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v28, v12, 2 @@ -2621,7 +2641,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 2 ; RV32-NEXT: .LBB20_20: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 80(sp) ; RV32-NEXT: bnez a0, .LBB20_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v28, v12, 1 @@ -2630,31 +2650,32 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 1 ; RV32-NEXT: .LBB20_23: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 72(sp) ; RV32-NEXT: vsetivli zero, 8, e64,m4,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -192 +; RV32-NEXT: lw s0, 184(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 192 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: .cfi_def_cfa_offset 192 +; RV64-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 176(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: addi s0, sp, 192 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; 
RV64-NEXT: bnez a0, .LBB20_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v12 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 64(sp) ; RV64-NEXT: beqz a0, .LBB20_4 ; RV64-NEXT: .LBB20_2: ; RV64-NEXT: vsetivli zero, 1, e64,m4,ta,mu @@ -2663,14 +2684,14 @@ ; RV64-NEXT: .LBB20_3: ; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 64(sp) ; RV64-NEXT: bnez a0, .LBB20_2 ; RV64-NEXT: .LBB20_4: ; RV64-NEXT: vsetivli zero, 1, e64,m4,ta,mu ; RV64-NEXT: vslidedown.vi v28, v12, 7 ; RV64-NEXT: .LBB20_5: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 56(sp) +; RV64-NEXT: fsd ft0, 120(sp) ; RV64-NEXT: bnez a0, .LBB20_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v28, v12, 6 @@ -2679,7 +2700,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 6 ; RV64-NEXT: .LBB20_8: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 48(sp) +; RV64-NEXT: fsd ft0, 112(sp) ; RV64-NEXT: bnez a0, .LBB20_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v28, v12, 5 @@ -2688,7 +2709,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 5 ; RV64-NEXT: .LBB20_11: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 40(sp) +; RV64-NEXT: fsd ft0, 104(sp) ; RV64-NEXT: bnez a0, .LBB20_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v28, v12, 4 @@ -2697,7 +2718,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 4 ; RV64-NEXT: .LBB20_14: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 32(sp) +; RV64-NEXT: fsd ft0, 96(sp) ; RV64-NEXT: bnez a0, .LBB20_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v28, v12, 3 @@ -2706,7 +2727,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 3 ; RV64-NEXT: .LBB20_17: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 88(sp) ; RV64-NEXT: bnez a0, .LBB20_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v28, v12, 2 @@ -2715,7 +2736,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 2 ; RV64-NEXT: .LBB20_20: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 80(sp) ; RV64-NEXT: bnez a0, .LBB20_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v28, v12, 1 @@ -2724,13 +2745,14 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 1 ; RV64-NEXT: .LBB20_23: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 72(sp) ; RV64-NEXT: vsetivli zero, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -192 +; RV64-NEXT: ld s0, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 ; RV64-NEXT: ret %v = select i1 %c, <8 x double> %a, <8 x double> %b ret <8 x double> %v @@ -2739,13 +2761,13 @@ define <8 x double> @selectcc_v8f64(double %a, double %b, <8 x double> %c, <8 x double> %d) { ; RV32-LABEL: selectcc_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -192 +; RV32-NEXT: .cfi_def_cfa_offset 192 +; RV32-NEXT: sw ra, 188(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 184(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: addi s0, sp, 192 ; RV32-NEXT: .cfi_def_cfa 
s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: feq.d a0, fa0, fa1 @@ -2753,7 +2775,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v12 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 64(sp) ; RV32-NEXT: beqz a0, .LBB21_4 ; RV32-NEXT: .LBB21_2: ; RV32-NEXT: vsetivli zero, 1, e64,m4,ta,mu @@ -2762,14 +2784,14 @@ ; RV32-NEXT: .LBB21_3: ; RV32-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 64(sp) ; RV32-NEXT: bnez a0, .LBB21_2 ; RV32-NEXT: .LBB21_4: ; RV32-NEXT: vsetivli zero, 1, e64,m4,ta,mu ; RV32-NEXT: vslidedown.vi v28, v12, 7 ; RV32-NEXT: .LBB21_5: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 56(sp) +; RV32-NEXT: fsd ft0, 120(sp) ; RV32-NEXT: bnez a0, .LBB21_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v28, v12, 6 @@ -2778,7 +2800,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 6 ; RV32-NEXT: .LBB21_8: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 48(sp) +; RV32-NEXT: fsd ft0, 112(sp) ; RV32-NEXT: bnez a0, .LBB21_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v28, v12, 5 @@ -2787,7 +2809,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 5 ; RV32-NEXT: .LBB21_11: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 40(sp) +; RV32-NEXT: fsd ft0, 104(sp) ; RV32-NEXT: bnez a0, .LBB21_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v28, v12, 4 @@ -2796,7 +2818,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 4 ; RV32-NEXT: .LBB21_14: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 32(sp) +; RV32-NEXT: fsd ft0, 96(sp) ; RV32-NEXT: bnez a0, .LBB21_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v28, v12, 3 @@ -2805,7 +2827,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 3 ; RV32-NEXT: .LBB21_17: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 88(sp) ; RV32-NEXT: bnez a0, .LBB21_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v28, v12, 2 @@ -2814,7 +2836,7 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 2 ; RV32-NEXT: .LBB21_20: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 80(sp) ; RV32-NEXT: bnez a0, .LBB21_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v28, v12, 1 @@ -2823,24 +2845,25 @@ ; RV32-NEXT: vslidedown.vi v28, v8, 1 ; RV32-NEXT: .LBB21_23: ; RV32-NEXT: vfmv.f.s ft0, v28 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 72(sp) ; RV32-NEXT: vsetivli zero, 8, e64,m4,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -192 +; RV32-NEXT: lw s0, 184(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 188(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 192 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -128 -; RV64-NEXT: .cfi_def_cfa_offset 128 -; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -192 +; RV64-NEXT: .cfi_def_cfa_offset 192 +; RV64-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 176(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: addi s0, sp, 192 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: feq.d a0, fa0, fa1 @@ -2848,7 +2871,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: 
vsetvli zero, zero, e64,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v12 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 64(sp) ; RV64-NEXT: beqz a0, .LBB21_4 ; RV64-NEXT: .LBB21_2: ; RV64-NEXT: vsetivli zero, 1, e64,m4,ta,mu @@ -2857,14 +2880,14 @@ ; RV64-NEXT: .LBB21_3: ; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 64(sp) ; RV64-NEXT: bnez a0, .LBB21_2 ; RV64-NEXT: .LBB21_4: ; RV64-NEXT: vsetivli zero, 1, e64,m4,ta,mu ; RV64-NEXT: vslidedown.vi v28, v12, 7 ; RV64-NEXT: .LBB21_5: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 56(sp) +; RV64-NEXT: fsd ft0, 120(sp) ; RV64-NEXT: bnez a0, .LBB21_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v28, v12, 6 @@ -2873,7 +2896,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 6 ; RV64-NEXT: .LBB21_8: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 48(sp) +; RV64-NEXT: fsd ft0, 112(sp) ; RV64-NEXT: bnez a0, .LBB21_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v28, v12, 5 @@ -2882,7 +2905,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 5 ; RV64-NEXT: .LBB21_11: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 40(sp) +; RV64-NEXT: fsd ft0, 104(sp) ; RV64-NEXT: bnez a0, .LBB21_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v28, v12, 4 @@ -2891,7 +2914,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 4 ; RV64-NEXT: .LBB21_14: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 32(sp) +; RV64-NEXT: fsd ft0, 96(sp) ; RV64-NEXT: bnez a0, .LBB21_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v28, v12, 3 @@ -2900,7 +2923,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 3 ; RV64-NEXT: .LBB21_17: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 88(sp) ; RV64-NEXT: bnez a0, .LBB21_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v28, v12, 2 @@ -2909,7 +2932,7 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 2 ; RV64-NEXT: .LBB21_20: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 80(sp) ; RV64-NEXT: bnez a0, .LBB21_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v28, v12, 1 @@ -2918,13 +2941,14 @@ ; RV64-NEXT: vslidedown.vi v28, v8, 1 ; RV64-NEXT: .LBB21_23: ; RV64-NEXT: vfmv.f.s ft0, v28 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 72(sp) ; RV64-NEXT: vsetivli zero, 8, e64,m4,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -128 -; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -192 +; RV64-NEXT: ld s0, 176(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 184(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 192 ; RV64-NEXT: ret %cmp = fcmp oeq double %a, %b %v = select i1 %cmp, <8 x double> %c, <8 x double> %d @@ -2934,20 +2958,20 @@ define <16 x double> @select_v16f64(i1 zeroext %c, <16 x double> %a, <16 x double> %b) { ; RV32-LABEL: select_v16f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -256 -; RV32-NEXT: .cfi_def_cfa_offset 256 -; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -384 +; RV32-NEXT: .cfi_def_cfa_offset 384 +; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 256 +; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: bnez a0, 
.LBB22_3 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v16 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 128(sp) ; RV32-NEXT: beqz a0, .LBB22_4 ; RV32-NEXT: .LBB22_2: ; RV32-NEXT: vsetivli zero, 1, e64,m8,ta,mu @@ -2956,14 +2980,14 @@ ; RV32-NEXT: .LBB22_3: ; RV32-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 128(sp) ; RV32-NEXT: bnez a0, .LBB22_2 ; RV32-NEXT: .LBB22_4: ; RV32-NEXT: vsetivli zero, 1, e64,m8,ta,mu ; RV32-NEXT: vslidedown.vi v24, v16, 15 ; RV32-NEXT: .LBB22_5: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 120(sp) +; RV32-NEXT: fsd ft0, 248(sp) ; RV32-NEXT: bnez a0, .LBB22_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v24, v16, 14 @@ -2972,7 +2996,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 14 ; RV32-NEXT: .LBB22_8: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 112(sp) +; RV32-NEXT: fsd ft0, 240(sp) ; RV32-NEXT: bnez a0, .LBB22_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v24, v16, 13 @@ -2981,7 +3005,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 13 ; RV32-NEXT: .LBB22_11: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 104(sp) +; RV32-NEXT: fsd ft0, 232(sp) ; RV32-NEXT: bnez a0, .LBB22_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v24, v16, 12 @@ -2990,7 +3014,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 12 ; RV32-NEXT: .LBB22_14: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 96(sp) +; RV32-NEXT: fsd ft0, 224(sp) ; RV32-NEXT: bnez a0, .LBB22_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v24, v16, 11 @@ -2999,7 +3023,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 11 ; RV32-NEXT: .LBB22_17: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 88(sp) +; RV32-NEXT: fsd ft0, 216(sp) ; RV32-NEXT: bnez a0, .LBB22_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v24, v16, 10 @@ -3008,7 +3032,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 10 ; RV32-NEXT: .LBB22_20: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 80(sp) +; RV32-NEXT: fsd ft0, 208(sp) ; RV32-NEXT: bnez a0, .LBB22_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v24, v16, 9 @@ -3017,7 +3041,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 9 ; RV32-NEXT: .LBB22_23: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 72(sp) +; RV32-NEXT: fsd ft0, 200(sp) ; RV32-NEXT: bnez a0, .LBB22_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v24, v16, 8 @@ -3026,7 +3050,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 8 ; RV32-NEXT: .LBB22_26: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 64(sp) +; RV32-NEXT: fsd ft0, 192(sp) ; RV32-NEXT: bnez a0, .LBB22_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v24, v16, 7 @@ -3035,7 +3059,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 7 ; RV32-NEXT: .LBB22_29: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 56(sp) +; RV32-NEXT: fsd ft0, 184(sp) ; RV32-NEXT: bnez a0, .LBB22_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v24, v16, 6 @@ -3044,7 +3068,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 6 ; RV32-NEXT: .LBB22_32: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 48(sp) +; RV32-NEXT: fsd ft0, 176(sp) ; RV32-NEXT: bnez a0, .LBB22_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v24, v16, 5 @@ -3053,7 +3077,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 5 ; RV32-NEXT: .LBB22_35: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 40(sp) +; RV32-NEXT: fsd ft0, 168(sp) ; RV32-NEXT: bnez a0, .LBB22_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v24, v16, 4 @@ -3062,7 +3086,7 @@ ; RV32-NEXT: vslidedown.vi 
v24, v8, 4 ; RV32-NEXT: .LBB22_38: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 32(sp) +; RV32-NEXT: fsd ft0, 160(sp) ; RV32-NEXT: bnez a0, .LBB22_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v24, v16, 3 @@ -3071,7 +3095,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 3 ; RV32-NEXT: .LBB22_41: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 152(sp) ; RV32-NEXT: bnez a0, .LBB22_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v24, v16, 2 @@ -3080,7 +3104,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 2 ; RV32-NEXT: .LBB22_44: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 144(sp) ; RV32-NEXT: bnez a0, .LBB22_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v8, v16, 1 @@ -3089,31 +3113,32 @@ ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: .LBB22_47: ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 136(sp) ; RV32-NEXT: vsetivli zero, 16, e64,m8,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -256 -; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 256 +; RV32-NEXT: addi a0, sp, 128 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -384 +; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 384 ; RV32-NEXT: ret ; ; RV64-LABEL: select_v16f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -256 -; RV64-NEXT: .cfi_def_cfa_offset 256 -; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -384 +; RV64-NEXT: .cfi_def_cfa_offset 384 +; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: bnez a0, .LBB22_3 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v16 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 128(sp) ; RV64-NEXT: beqz a0, .LBB22_4 ; RV64-NEXT: .LBB22_2: ; RV64-NEXT: vsetivli zero, 1, e64,m8,ta,mu @@ -3122,14 +3147,14 @@ ; RV64-NEXT: .LBB22_3: ; RV64-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 128(sp) ; RV64-NEXT: bnez a0, .LBB22_2 ; RV64-NEXT: .LBB22_4: ; RV64-NEXT: vsetivli zero, 1, e64,m8,ta,mu ; RV64-NEXT: vslidedown.vi v24, v16, 15 ; RV64-NEXT: .LBB22_5: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 120(sp) +; RV64-NEXT: fsd ft0, 248(sp) ; RV64-NEXT: bnez a0, .LBB22_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: vslidedown.vi v24, v16, 14 @@ -3138,7 +3163,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 14 ; RV64-NEXT: .LBB22_8: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 112(sp) +; RV64-NEXT: fsd ft0, 240(sp) ; RV64-NEXT: bnez a0, .LBB22_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v24, v16, 13 @@ -3147,7 +3172,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 13 ; RV64-NEXT: .LBB22_11: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 104(sp) +; RV64-NEXT: fsd ft0, 232(sp) ; RV64-NEXT: bnez a0, .LBB22_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v24, v16, 12 @@ -3156,7 +3181,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 12 ; RV64-NEXT: .LBB22_14: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 96(sp) +; RV64-NEXT: fsd ft0, 224(sp) ; 
RV64-NEXT: bnez a0, .LBB22_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v24, v16, 11 @@ -3165,7 +3190,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 11 ; RV64-NEXT: .LBB22_17: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 88(sp) +; RV64-NEXT: fsd ft0, 216(sp) ; RV64-NEXT: bnez a0, .LBB22_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v24, v16, 10 @@ -3174,7 +3199,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 10 ; RV64-NEXT: .LBB22_20: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 80(sp) +; RV64-NEXT: fsd ft0, 208(sp) ; RV64-NEXT: bnez a0, .LBB22_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v24, v16, 9 @@ -3183,7 +3208,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 9 ; RV64-NEXT: .LBB22_23: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 72(sp) +; RV64-NEXT: fsd ft0, 200(sp) ; RV64-NEXT: bnez a0, .LBB22_25 ; RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v24, v16, 8 @@ -3192,7 +3217,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 8 ; RV64-NEXT: .LBB22_26: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 64(sp) +; RV64-NEXT: fsd ft0, 192(sp) ; RV64-NEXT: bnez a0, .LBB22_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v24, v16, 7 @@ -3201,7 +3226,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 7 ; RV64-NEXT: .LBB22_29: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 56(sp) +; RV64-NEXT: fsd ft0, 184(sp) ; RV64-NEXT: bnez a0, .LBB22_31 ; RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v24, v16, 6 @@ -3210,7 +3235,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 6 ; RV64-NEXT: .LBB22_32: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 48(sp) +; RV64-NEXT: fsd ft0, 176(sp) ; RV64-NEXT: bnez a0, .LBB22_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v24, v16, 5 @@ -3219,7 +3244,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 5 ; RV64-NEXT: .LBB22_35: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 40(sp) +; RV64-NEXT: fsd ft0, 168(sp) ; RV64-NEXT: bnez a0, .LBB22_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v24, v16, 4 @@ -3228,7 +3253,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 4 ; RV64-NEXT: .LBB22_38: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 32(sp) +; RV64-NEXT: fsd ft0, 160(sp) ; RV64-NEXT: bnez a0, .LBB22_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v24, v16, 3 @@ -3237,7 +3262,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 3 ; RV64-NEXT: .LBB22_41: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 152(sp) ; RV64-NEXT: bnez a0, .LBB22_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v24, v16, 2 @@ -3246,7 +3271,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 2 ; RV64-NEXT: .LBB22_44: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 144(sp) ; RV64-NEXT: bnez a0, .LBB22_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v8, v16, 1 @@ -3255,13 +3280,14 @@ ; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: .LBB22_47: ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 136(sp) ; RV64-NEXT: vsetivli zero, 16, e64,m8,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -256 -; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 256 +; RV64-NEXT: addi a0, sp, 128 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -384 +; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 384 ; RV64-NEXT: ret %v = select i1 %c, <16 x double> %a, <16 x double> %b ret <16 x double> %v @@ -3270,13 +3296,13 
@@ define <16 x double> @selectcc_v16f64(double %a, double %b, <16 x double> %c, <16 x double> %d) { ; RV32-LABEL: selectcc_v16f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -256 -; RV32-NEXT: .cfi_def_cfa_offset 256 -; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -384 +; RV32-NEXT: .cfi_def_cfa_offset 384 +; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 256 +; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: feq.d a0, fa0, fa1 @@ -3284,7 +3310,7 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v16 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 128(sp) ; RV32-NEXT: beqz a0, .LBB23_4 ; RV32-NEXT: .LBB23_2: ; RV32-NEXT: vsetivli zero, 1, e64,m8,ta,mu @@ -3293,14 +3319,14 @@ ; RV32-NEXT: .LBB23_3: ; RV32-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 0(sp) +; RV32-NEXT: fsd ft0, 128(sp) ; RV32-NEXT: bnez a0, .LBB23_2 ; RV32-NEXT: .LBB23_4: ; RV32-NEXT: vsetivli zero, 1, e64,m8,ta,mu ; RV32-NEXT: vslidedown.vi v24, v16, 15 ; RV32-NEXT: .LBB23_5: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 120(sp) +; RV32-NEXT: fsd ft0, 248(sp) ; RV32-NEXT: bnez a0, .LBB23_7 ; RV32-NEXT: # %bb.6: ; RV32-NEXT: vslidedown.vi v24, v16, 14 @@ -3309,7 +3335,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 14 ; RV32-NEXT: .LBB23_8: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 112(sp) +; RV32-NEXT: fsd ft0, 240(sp) ; RV32-NEXT: bnez a0, .LBB23_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: vslidedown.vi v24, v16, 13 @@ -3318,7 +3344,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 13 ; RV32-NEXT: .LBB23_11: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 104(sp) +; RV32-NEXT: fsd ft0, 232(sp) ; RV32-NEXT: bnez a0, .LBB23_13 ; RV32-NEXT: # %bb.12: ; RV32-NEXT: vslidedown.vi v24, v16, 12 @@ -3327,7 +3353,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 12 ; RV32-NEXT: .LBB23_14: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 96(sp) +; RV32-NEXT: fsd ft0, 224(sp) ; RV32-NEXT: bnez a0, .LBB23_16 ; RV32-NEXT: # %bb.15: ; RV32-NEXT: vslidedown.vi v24, v16, 11 @@ -3336,7 +3362,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 11 ; RV32-NEXT: .LBB23_17: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 88(sp) +; RV32-NEXT: fsd ft0, 216(sp) ; RV32-NEXT: bnez a0, .LBB23_19 ; RV32-NEXT: # %bb.18: ; RV32-NEXT: vslidedown.vi v24, v16, 10 @@ -3345,7 +3371,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 10 ; RV32-NEXT: .LBB23_20: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 80(sp) +; RV32-NEXT: fsd ft0, 208(sp) ; RV32-NEXT: bnez a0, .LBB23_22 ; RV32-NEXT: # %bb.21: ; RV32-NEXT: vslidedown.vi v24, v16, 9 @@ -3354,7 +3380,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 9 ; RV32-NEXT: .LBB23_23: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 72(sp) +; RV32-NEXT: fsd ft0, 200(sp) ; RV32-NEXT: bnez a0, .LBB23_25 ; RV32-NEXT: # %bb.24: ; RV32-NEXT: vslidedown.vi v24, v16, 8 @@ -3363,7 +3389,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 8 ; RV32-NEXT: .LBB23_26: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 64(sp) +; RV32-NEXT: fsd ft0, 192(sp) ; RV32-NEXT: bnez a0, .LBB23_28 ; RV32-NEXT: # %bb.27: ; RV32-NEXT: vslidedown.vi v24, v16, 7 @@ -3372,7 +3398,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 7 ; RV32-NEXT: .LBB23_29: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd 
ft0, 56(sp) +; RV32-NEXT: fsd ft0, 184(sp) ; RV32-NEXT: bnez a0, .LBB23_31 ; RV32-NEXT: # %bb.30: ; RV32-NEXT: vslidedown.vi v24, v16, 6 @@ -3381,7 +3407,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 6 ; RV32-NEXT: .LBB23_32: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 48(sp) +; RV32-NEXT: fsd ft0, 176(sp) ; RV32-NEXT: bnez a0, .LBB23_34 ; RV32-NEXT: # %bb.33: ; RV32-NEXT: vslidedown.vi v24, v16, 5 @@ -3390,7 +3416,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 5 ; RV32-NEXT: .LBB23_35: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 40(sp) +; RV32-NEXT: fsd ft0, 168(sp) ; RV32-NEXT: bnez a0, .LBB23_37 ; RV32-NEXT: # %bb.36: ; RV32-NEXT: vslidedown.vi v24, v16, 4 @@ -3399,7 +3425,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 4 ; RV32-NEXT: .LBB23_38: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 32(sp) +; RV32-NEXT: fsd ft0, 160(sp) ; RV32-NEXT: bnez a0, .LBB23_40 ; RV32-NEXT: # %bb.39: ; RV32-NEXT: vslidedown.vi v24, v16, 3 @@ -3408,7 +3434,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 3 ; RV32-NEXT: .LBB23_41: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 24(sp) +; RV32-NEXT: fsd ft0, 152(sp) ; RV32-NEXT: bnez a0, .LBB23_43 ; RV32-NEXT: # %bb.42: ; RV32-NEXT: vslidedown.vi v24, v16, 2 @@ -3417,7 +3443,7 @@ ; RV32-NEXT: vslidedown.vi v24, v8, 2 ; RV32-NEXT: .LBB23_44: ; RV32-NEXT: vfmv.f.s ft0, v24 -; RV32-NEXT: fsd ft0, 16(sp) +; RV32-NEXT: fsd ft0, 144(sp) ; RV32-NEXT: bnez a0, .LBB23_46 ; RV32-NEXT: # %bb.45: ; RV32-NEXT: vslidedown.vi v8, v16, 1 @@ -3426,24 +3452,25 @@ ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: .LBB23_47: ; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fsd ft0, 8(sp) +; RV32-NEXT: fsd ft0, 136(sp) ; RV32-NEXT: vsetivli zero, 16, e64,m8,ta,mu -; RV32-NEXT: vle64.v v8, (sp) -; RV32-NEXT: addi sp, s0, -256 -; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 256 +; RV32-NEXT: addi a0, sp, 128 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -384 +; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 384 ; RV32-NEXT: ret ; ; RV64-LABEL: selectcc_v16f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -256 -; RV64-NEXT: .cfi_def_cfa_offset 256 -; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -384 +; RV64-NEXT: .cfi_def_cfa_offset 384 +; RV64-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: feq.d a0, fa0, fa1 @@ -3451,7 +3478,7 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v16 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 128(sp) ; RV64-NEXT: beqz a0, .LBB23_4 ; RV64-NEXT: .LBB23_2: ; RV64-NEXT: vsetivli zero, 1, e64,m8,ta,mu @@ -3460,14 +3487,14 @@ ; RV64-NEXT: .LBB23_3: ; RV64-NEXT: vsetvli zero, zero, e64,m8,ta,mu ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 0(sp) +; RV64-NEXT: fsd ft0, 128(sp) ; RV64-NEXT: bnez a0, .LBB23_2 ; RV64-NEXT: .LBB23_4: ; RV64-NEXT: vsetivli zero, 1, e64,m8,ta,mu ; RV64-NEXT: vslidedown.vi v24, v16, 15 ; RV64-NEXT: .LBB23_5: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 120(sp) +; RV64-NEXT: fsd ft0, 248(sp) ; RV64-NEXT: bnez a0, .LBB23_7 ; RV64-NEXT: # %bb.6: ; RV64-NEXT: 
vslidedown.vi v24, v16, 14 @@ -3476,7 +3503,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 14 ; RV64-NEXT: .LBB23_8: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 112(sp) +; RV64-NEXT: fsd ft0, 240(sp) ; RV64-NEXT: bnez a0, .LBB23_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: vslidedown.vi v24, v16, 13 @@ -3485,7 +3512,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 13 ; RV64-NEXT: .LBB23_11: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 104(sp) +; RV64-NEXT: fsd ft0, 232(sp) ; RV64-NEXT: bnez a0, .LBB23_13 ; RV64-NEXT: # %bb.12: ; RV64-NEXT: vslidedown.vi v24, v16, 12 @@ -3494,7 +3521,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 12 ; RV64-NEXT: .LBB23_14: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 96(sp) +; RV64-NEXT: fsd ft0, 224(sp) ; RV64-NEXT: bnez a0, .LBB23_16 ; RV64-NEXT: # %bb.15: ; RV64-NEXT: vslidedown.vi v24, v16, 11 @@ -3503,7 +3530,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 11 ; RV64-NEXT: .LBB23_17: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 88(sp) +; RV64-NEXT: fsd ft0, 216(sp) ; RV64-NEXT: bnez a0, .LBB23_19 ; RV64-NEXT: # %bb.18: ; RV64-NEXT: vslidedown.vi v24, v16, 10 @@ -3512,7 +3539,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 10 ; RV64-NEXT: .LBB23_20: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 80(sp) +; RV64-NEXT: fsd ft0, 208(sp) ; RV64-NEXT: bnez a0, .LBB23_22 ; RV64-NEXT: # %bb.21: ; RV64-NEXT: vslidedown.vi v24, v16, 9 @@ -3521,7 +3548,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 9 ; RV64-NEXT: .LBB23_23: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 72(sp) +; RV64-NEXT: fsd ft0, 200(sp) ; RV64-NEXT: bnez a0, .LBB23_25 ; RV64-NEXT: # %bb.24: ; RV64-NEXT: vslidedown.vi v24, v16, 8 @@ -3530,7 +3557,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 8 ; RV64-NEXT: .LBB23_26: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 64(sp) +; RV64-NEXT: fsd ft0, 192(sp) ; RV64-NEXT: bnez a0, .LBB23_28 ; RV64-NEXT: # %bb.27: ; RV64-NEXT: vslidedown.vi v24, v16, 7 @@ -3539,7 +3566,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 7 ; RV64-NEXT: .LBB23_29: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 56(sp) +; RV64-NEXT: fsd ft0, 184(sp) ; RV64-NEXT: bnez a0, .LBB23_31 ; RV64-NEXT: # %bb.30: ; RV64-NEXT: vslidedown.vi v24, v16, 6 @@ -3548,7 +3575,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 6 ; RV64-NEXT: .LBB23_32: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 48(sp) +; RV64-NEXT: fsd ft0, 176(sp) ; RV64-NEXT: bnez a0, .LBB23_34 ; RV64-NEXT: # %bb.33: ; RV64-NEXT: vslidedown.vi v24, v16, 5 @@ -3557,7 +3584,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 5 ; RV64-NEXT: .LBB23_35: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 40(sp) +; RV64-NEXT: fsd ft0, 168(sp) ; RV64-NEXT: bnez a0, .LBB23_37 ; RV64-NEXT: # %bb.36: ; RV64-NEXT: vslidedown.vi v24, v16, 4 @@ -3566,7 +3593,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 4 ; RV64-NEXT: .LBB23_38: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 32(sp) +; RV64-NEXT: fsd ft0, 160(sp) ; RV64-NEXT: bnez a0, .LBB23_40 ; RV64-NEXT: # %bb.39: ; RV64-NEXT: vslidedown.vi v24, v16, 3 @@ -3575,7 +3602,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 3 ; RV64-NEXT: .LBB23_41: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 24(sp) +; RV64-NEXT: fsd ft0, 152(sp) ; RV64-NEXT: bnez a0, .LBB23_43 ; RV64-NEXT: # %bb.42: ; RV64-NEXT: vslidedown.vi v24, v16, 2 @@ -3584,7 +3611,7 @@ ; RV64-NEXT: vslidedown.vi v24, v8, 2 ; RV64-NEXT: .LBB23_44: ; RV64-NEXT: vfmv.f.s ft0, v24 -; RV64-NEXT: fsd ft0, 16(sp) +; RV64-NEXT: fsd ft0, 144(sp) ; RV64-NEXT: bnez a0, .LBB23_46 ; RV64-NEXT: # %bb.45: ; RV64-NEXT: vslidedown.vi v8, v16, 1 @@ 
-3593,13 +3620,14 @@ ; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: .LBB23_47: ; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fsd ft0, 8(sp) +; RV64-NEXT: fsd ft0, 136(sp) ; RV64-NEXT: vsetivli zero, 16, e64,m8,ta,mu -; RV64-NEXT: vle64.v v8, (sp) -; RV64-NEXT: addi sp, s0, -256 -; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 256 +; RV64-NEXT: addi a0, sp, 128 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -384 +; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 384 ; RV64-NEXT: ret %cmp = fcmp oeq double %a, %b %v = select i1 %cmp, <16 x double> %c, <16 x double> %d diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -7,8 +7,8 @@ define <4 x i32> @load_v4i32_align1(<4 x i32>* %ptr) { ; RV32-LABEL: load_v4i32_align1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lbu a1, 13(a0) ; RV32-NEXT: lbu a2, 12(a0) ; RV32-NEXT: lbu a3, 15(a0) @@ -19,7 +19,7 @@ ; RV32-NEXT: or a2, a2, a4 ; RV32-NEXT: slli a2, a2, 16 ; RV32-NEXT: or a1, a2, a1 -; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lbu a1, 9(a0) ; RV32-NEXT: lbu a2, 8(a0) ; RV32-NEXT: lbu a3, 11(a0) @@ -30,7 +30,7 @@ ; RV32-NEXT: or a2, a2, a4 ; RV32-NEXT: slli a2, a2, 16 ; RV32-NEXT: or a1, a2, a1 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: lbu a1, 5(a0) ; RV32-NEXT: lbu a2, 4(a0) ; RV32-NEXT: lbu a3, 7(a0) @@ -41,7 +41,7 @@ ; RV32-NEXT: or a2, a2, a4 ; RV32-NEXT: slli a2, a2, 16 ; RV32-NEXT: or a1, a2, a1 -; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lbu a1, 1(a0) ; RV32-NEXT: lbu a2, 0(a0) ; RV32-NEXT: lbu a3, 3(a0) @@ -52,16 +52,17 @@ ; RV32-NEXT: or a0, a2, a0 ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a0, 16(sp) ; RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: load_v4i32_align1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 ; RV64-NEXT: lbu a1, 9(a0) ; RV64-NEXT: lbu a2, 8(a0) ; RV64-NEXT: lbu a3, 11(a0) @@ -84,7 +85,7 @@ ; RV64-NEXT: or a2, a3, a2 ; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: or a1, a2, a1 -; RV64-NEXT: sd a1, 8(sp) +; RV64-NEXT: sd a1, 24(sp) ; RV64-NEXT: lbu a1, 1(a0) ; RV64-NEXT: lbu a2, 0(a0) ; RV64-NEXT: lbu a3, 3(a0) @@ -107,10 +108,11 @@ ; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: sd a0, 0(sp) +; RV64-NEXT: sd a0, 16(sp) ; RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, sp, 32 ; RV64-NEXT: ret %z = load <4 x i32>, <4 x i32>* %ptr, align 1 ret <4 x i32> %z @@ -119,37 +121,38 @@ define <4 x i32> @load_v4i32_align2(<4 x i32>* %ptr) { ; RV32-LABEL: load_v4i32_align2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 
16 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lhu a1, 14(a0) ; RV32-NEXT: lhu a2, 12(a0) ; RV32-NEXT: slli a1, a1, 16 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lhu a1, 10(a0) ; RV32-NEXT: lhu a2, 8(a0) ; RV32-NEXT: slli a1, a1, 16 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: lhu a1, 6(a0) ; RV32-NEXT: lhu a2, 4(a0) ; RV32-NEXT: slli a1, a1, 16 ; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lhu a1, 2(a0) ; RV32-NEXT: lhu a0, 0(a0) ; RV32-NEXT: slli a1, a1, 16 ; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a0, 16(sp) ; RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV32-NEXT: vle32.v v8, (sp) -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: load_v4i32_align2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 ; RV64-NEXT: lhu a1, 10(a0) ; RV64-NEXT: lhu a2, 8(a0) ; RV64-NEXT: lhu a3, 14(a0) @@ -160,7 +163,7 @@ ; RV64-NEXT: or a2, a2, a4 ; RV64-NEXT: slli a2, a2, 32 ; RV64-NEXT: or a1, a2, a1 -; RV64-NEXT: sd a1, 8(sp) +; RV64-NEXT: sd a1, 24(sp) ; RV64-NEXT: lhu a1, 2(a0) ; RV64-NEXT: lhu a2, 0(a0) ; RV64-NEXT: lhu a3, 6(a0) @@ -171,10 +174,11 @@ ; RV64-NEXT: or a0, a2, a0 ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: sd a0, 0(sp) +; RV64-NEXT: sd a0, 16(sp) ; RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV64-NEXT: vle32.v v8, (sp) -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi sp, sp, 32 ; RV64-NEXT: ret %z = load <4 x i32>, <4 x i32>* %ptr, align 2 ret <4 x i32> %z @@ -183,17 +187,18 @@ define void @store_v4i32_align1(<4 x i32> %x, <4 x i32>* %ptr) { ; RV32-LABEL: store_v4i32_align1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV32-NEXT: vse32.v v8, (sp) -; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: lw a1, 28(sp) ; RV32-NEXT: sb a1, 12(a0) -; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a2, 24(sp) ; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lw a3, 4(sp) +; RV32-NEXT: lw a3, 20(sp) ; RV32-NEXT: sb a3, 4(a0) -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a4, 16(sp) ; RV32-NEXT: sb a4, 0(a0) ; RV32-NEXT: srli a5, a1, 24 ; RV32-NEXT: sb a5, 15(a0) @@ -219,18 +224,19 @@ ; RV32-NEXT: sb a1, 2(a0) ; RV32-NEXT: srli a1, a4, 8 ; RV32-NEXT: sb a1, 1(a0) -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: store_v4i32_align1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 ; RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV64-NEXT: vse32.v v8, (sp) -; RV64-NEXT: ld a1, 8(sp) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vse32.v v8, (a1) +; RV64-NEXT: ld a1, 24(sp) ; RV64-NEXT: sb a1, 8(a0) -; RV64-NEXT: ld a2, 0(sp) +; RV64-NEXT: ld a2, 16(sp) ; RV64-NEXT: sb a2, 0(a0) ; RV64-NEXT: srli a3, a1, 56 ; RV64-NEXT: sb a3, 15(a0) @@ -260,7 +266,7 @@ ; RV64-NEXT: sb a1, 2(a0) ; RV64-NEXT: srli a1, a2, 8 ; RV64-NEXT: sb a1, 1(a0) -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: 
addi sp, sp, 32 ; RV64-NEXT: ret store <4 x i32> %x, <4 x i32>* %ptr, align 1 ret void @@ -269,17 +275,18 @@ define void @store_v4i32_align2(<4 x i32> %x, <4 x i32>* %ptr) { ; RV32-LABEL: store_v4i32_align2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV32-NEXT: vse32.v v8, (sp) -; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: lw a1, 28(sp) ; RV32-NEXT: sh a1, 12(a0) -; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a2, 24(sp) ; RV32-NEXT: sh a2, 8(a0) -; RV32-NEXT: lw a3, 4(sp) +; RV32-NEXT: lw a3, 20(sp) ; RV32-NEXT: sh a3, 4(a0) -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a4, 16(sp) ; RV32-NEXT: sh a4, 0(a0) ; RV32-NEXT: srli a1, a1, 16 ; RV32-NEXT: sh a1, 14(a0) @@ -289,18 +296,19 @@ ; RV32-NEXT: sh a1, 6(a0) ; RV32-NEXT: srli a1, a4, 16 ; RV32-NEXT: sh a1, 2(a0) -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: store_v4i32_align2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 ; RV64-NEXT: vsetivli zero, 4, e32,m1,ta,mu -; RV64-NEXT: vse32.v v8, (sp) -; RV64-NEXT: ld a1, 8(sp) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vse32.v v8, (a1) +; RV64-NEXT: ld a1, 24(sp) ; RV64-NEXT: sh a1, 8(a0) -; RV64-NEXT: ld a2, 0(sp) +; RV64-NEXT: ld a2, 16(sp) ; RV64-NEXT: sh a2, 0(a0) ; RV64-NEXT: srli a3, a1, 48 ; RV64-NEXT: sh a3, 14(a0) @@ -314,7 +322,7 @@ ; RV64-NEXT: sh a1, 4(a0) ; RV64-NEXT: srli a1, a2, 16 ; RV64-NEXT: sh a1, 2(a0) -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: addi sp, sp, 32 ; RV64-NEXT: ret store <4 x i32> %x, <4 x i32>* %ptr, align 2 ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir b/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir --- a/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir +++ b/llvm/test/CodeGen/RISCV/rvv/get-vlen-debugloc.mir @@ -25,8 +25,7 @@ ; CHECK-LABEL: name: foo ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: $x2 = frame-setup ADDI $x2, -16 - ; CHECK: CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK: CFI_INSTRUCTION def_cfa_offset 0 ; CHECK: $x10 = PseudoReadVLENB ; CHECK: $x10 = SLLI killed $x10, 1 ; CHECK: $x2 = SUB $x2, killed $x10 @@ -34,7 +33,6 @@ ; CHECK: $x10 = PseudoReadVLENB ; CHECK: $x10 = SLLI killed $x10, 1 ; CHECK: $x2 = ADD $x2, killed $x10 - ; CHECK: $x2 = frame-destroy ADDI $x2, 16 ; CHECK: PseudoRET bb.0: bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/localvar.ll b/llvm/test/CodeGen/RISCV/rvv/localvar.ll --- a/llvm/test/CodeGen/RISCV/rvv/localvar.ll +++ b/llvm/test/CodeGen/RISCV/rvv/localvar.ll @@ -290,18 +290,18 @@ define i64 @fixed_object(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8) nounwind { ; RV64IV-LABEL: fixed_object: ; RV64IV: # %bb.0: -; RV64IV-NEXT: addi sp, sp, -32 +; RV64IV-NEXT: addi sp, sp, -16 ; RV64IV-NEXT: csrr a0, vlenb ; RV64IV-NEXT: slli a0, a0, 3 ; RV64IV-NEXT: sub sp, sp, a0 ; RV64IV-NEXT: csrr a0, vlenb ; RV64IV-NEXT: slli a0, a0, 3 ; RV64IV-NEXT: add a0, sp, a0 -; RV64IV-NEXT: ld a0, 32(a0) +; RV64IV-NEXT: ld a0, 16(a0) ; RV64IV-NEXT: csrr a1, vlenb ; RV64IV-NEXT: slli a1, a1, 3 ; RV64IV-NEXT: add sp, sp, a1 -; RV64IV-NEXT: addi sp, sp, 32 +; RV64IV-NEXT: addi sp, sp, 16 ; RV64IV-NEXT: ret %fixed_size = alloca i32 %rvv_vector = alloca , align 8 diff --git 
a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -663,15 +663,15 @@ ; ; RV32MV-LABEL: test_srem_vec: ; RV32MV: # %bb.0: -; RV32MV-NEXT: addi sp, sp, -64 -; RV32MV-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32MV-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32MV-NEXT: addi s0, sp, 64 +; RV32MV-NEXT: addi sp, sp, -96 +; RV32MV-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s1, 84(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s2, 80(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s3, 76(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s4, 72(sp) # 4-byte Folded Spill +; RV32MV-NEXT: sw s5, 68(sp) # 4-byte Folded Spill +; RV32MV-NEXT: addi s0, sp, 96 ; RV32MV-NEXT: andi sp, sp, -32 ; RV32MV-NEXT: mv s1, a0 ; RV32MV-NEXT: lw a0, 8(a0) @@ -695,28 +695,29 @@ ; RV32MV-NEXT: addi a2, zero, 6 ; RV32MV-NEXT: mv a3, zero ; RV32MV-NEXT: call __moddi3@plt -; RV32MV-NEXT: sw a1, 4(sp) -; RV32MV-NEXT: sw a0, 0(sp) +; RV32MV-NEXT: sw a1, 36(sp) +; RV32MV-NEXT: sw a0, 32(sp) ; RV32MV-NEXT: addi a2, zero, -5 ; RV32MV-NEXT: addi a3, zero, -1 ; RV32MV-NEXT: mv a0, s4 ; RV32MV-NEXT: mv a1, s5 ; RV32MV-NEXT: call __moddi3@plt -; RV32MV-NEXT: sw a1, 20(sp) -; RV32MV-NEXT: sw a0, 16(sp) +; RV32MV-NEXT: sw a1, 52(sp) +; RV32MV-NEXT: sw a0, 48(sp) ; RV32MV-NEXT: addi a2, zero, 7 ; RV32MV-NEXT: mv a0, s2 ; RV32MV-NEXT: mv a1, s3 ; RV32MV-NEXT: mv a3, zero ; RV32MV-NEXT: call __moddi3@plt -; RV32MV-NEXT: sw a1, 12(sp) -; RV32MV-NEXT: sw a0, 8(sp) +; RV32MV-NEXT: sw a1, 44(sp) +; RV32MV-NEXT: sw a0, 40(sp) ; RV32MV-NEXT: addi a0, zero, 85 ; RV32MV-NEXT: vsetivli zero, 1, e8,mf8,ta,mu ; RV32MV-NEXT: vmv.s.x v0, a0 ; RV32MV-NEXT: vsetivli zero, 8, e32,m2,ta,mu ; RV32MV-NEXT: vmv.v.i v26, 1 -; RV32MV-NEXT: vle32.v v28, (sp) +; RV32MV-NEXT: addi a0, sp, 32 +; RV32MV-NEXT: vle32.v v28, (a0) ; RV32MV-NEXT: lui a0, %hi(.LCPI3_0) ; RV32MV-NEXT: addi a0, a0, %lo(.LCPI3_0) ; RV32MV-NEXT: vle32.v v30, (a0) @@ -755,23 +756,23 @@ ; RV32MV-NEXT: slli a0, a0, 2 ; RV32MV-NEXT: or a0, a1, a0 ; RV32MV-NEXT: sw a0, 8(s1) -; RV32MV-NEXT: addi sp, s0, -64 -; RV32MV-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32MV-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32MV-NEXT: addi sp, sp, 64 +; RV32MV-NEXT: addi sp, s0, -96 +; RV32MV-NEXT: lw s5, 68(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s4, 72(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s3, 76(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s2, 80(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s1, 84(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32MV-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32MV-NEXT: addi sp, sp, 96 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_srem_vec: ; RV64MV: # %bb.0: -; RV64MV-NEXT: addi sp, sp, -64 -; RV64MV-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64MV-NEXT: 
sd s0, 48(sp) # 8-byte Folded Spill -; RV64MV-NEXT: addi s0, sp, 64 +; RV64MV-NEXT: addi sp, sp, -96 +; RV64MV-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; RV64MV-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; RV64MV-NEXT: addi s0, sp, 96 ; RV64MV-NEXT: andi sp, sp, -32 ; RV64MV-NEXT: lb a1, 12(a0) ; RV64MV-NEXT: lwu a2, 8(a0) @@ -804,7 +805,7 @@ ; RV64MV-NEXT: addi a5, zero, 6 ; RV64MV-NEXT: mul a1, a1, a5 ; RV64MV-NEXT: sub a1, a3, a1 -; RV64MV-NEXT: sd a1, 0(sp) +; RV64MV-NEXT: sd a1, 32(sp) ; RV64MV-NEXT: lui a1, 1035469 ; RV64MV-NEXT: addiw a1, a1, -819 ; RV64MV-NEXT: slli a1, a1, 12 @@ -820,7 +821,7 @@ ; RV64MV-NEXT: slli a3, a1, 2 ; RV64MV-NEXT: add a1, a3, a1 ; RV64MV-NEXT: add a1, a2, a1 -; RV64MV-NEXT: sd a1, 16(sp) +; RV64MV-NEXT: sd a1, 48(sp) ; RV64MV-NEXT: lui a1, 18725 ; RV64MV-NEXT: addiw a1, a1, -1755 ; RV64MV-NEXT: slli a1, a1, 12 @@ -836,9 +837,10 @@ ; RV64MV-NEXT: slli a2, a1, 3 ; RV64MV-NEXT: sub a1, a1, a2 ; RV64MV-NEXT: add a1, a4, a1 -; RV64MV-NEXT: sd a1, 8(sp) +; RV64MV-NEXT: sd a1, 40(sp) ; RV64MV-NEXT: vsetivli zero, 4, e64,m2,ta,mu -; RV64MV-NEXT: vle64.v v26, (sp) +; RV64MV-NEXT: addi a1, sp, 32 +; RV64MV-NEXT: vle64.v v26, (a1) ; RV64MV-NEXT: lui a1, %hi(.LCPI3_0) ; RV64MV-NEXT: addi a1, a1, %lo(.LCPI3_0) ; RV64MV-NEXT: vle64.v v28, (a1) @@ -865,10 +867,10 @@ ; RV64MV-NEXT: slli a2, a3, 33 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: sd a1, 0(a0) -; RV64MV-NEXT: addi sp, s0, -64 -; RV64MV-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64MV-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64MV-NEXT: addi sp, sp, 64 +; RV64MV-NEXT: addi sp, s0, -96 +; RV64MV-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; RV64MV-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; RV64MV-NEXT: addi sp, sp, 96 ; RV64MV-NEXT: ret %ld = load <3 x i33>, <3 x i33>* %X %srem = srem <3 x i33> %ld,