diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -39,6 +39,7 @@
   RISCVRedundantCopyElimination.cpp
   RISCVRegisterBankInfo.cpp
   RISCVRegisterInfo.cpp
+  RISCVRVVInitUndef.cpp
   RISCVSExtWRemoval.cpp
   RISCVSubtarget.cpp
   RISCVTargetMachine.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -59,6 +59,9 @@
 FunctionPass *createRISCVInsertVSETVLIPass();
 void initializeRISCVInsertVSETVLIPass(PassRegistry &);
 
+FunctionPass *createRISCVInitUndefPass();
+void initializeRISCVInitUndefPass(PassRegistry &);
+
 FunctionPass *createRISCVRedundantCopyEliminationPass();
 void initializeRISCVRedundantCopyEliminationPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp
@@ -0,0 +1,161 @@
+//===- RISCVRVVInitUndef.cpp - Initialize undef vector values to zero ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that initializes undef vector values to
+// zero, so that register allocation cannot produce a result that violates the
+// register constraints of a vector instruction.
+//
+// Certain RISC-V vector instructions have register-overlap constraints, and
+// violating them raises an illegal-instruction trap. We model these
+// constraints with early-clobber operands, but early clobber alone cannot stop
+// the register allocator from picking the same (or an overlapping) register
+// when the input register is an undef value. Converting the IMPLICIT_DEF into
+// an explicit zero initialization prevents that. This is not the ideal
+// solution: it may emit redundant zero-initialization instructions for values
+// that are truly undef, and the constraint should eventually be modeled
+// properly. Until that is done, this pass is the only way to prevent the
+// problem.
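+//
+// For illustration only, a schematic example (hypothetical MIR; pseudo names
+// other than PseudoVMV_V_I_M2 and the exact operand lists are placeholders)
+// of the rewrite this pass performs:
+//
+//   %0:vrm2 = IMPLICIT_DEF
+//   early-clobber %1:vrm2 = SomeVectorPseudo ..., %0, ...
+//
+// becomes
+//
+//   %0:vrm2 = PseudoVMV_V_I_M2 0, /*AVL=VLMAX*/ -1, /*SEW*/ ...
+//   early-clobber %1:vrm2 = SomeVectorPseudo ..., %0, ...
+//
+// so the register allocator can no longer assign %1 a register that overlaps
+// the now explicitly defined %0.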
+//
+// See also: https://github.com/llvm/llvm-project/issues/50157
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-init-undef"
+#define RISCV_INIT_UNDEF_NAME "RISCV init undef pass"
+
+namespace {
+
+class RISCVInitUndef : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+
+public:
+  static char ID;
+
+  RISCVInitUndef() : MachineFunctionPass(ID) {
+    initializeRISCVInitUndefPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return RISCV_INIT_UNDEF_NAME; }
+
+private:
+  bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+  void handleImplicitDef(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &Inst);
+  bool isVectorRegClass(const Register &R);
+};
+
+} // end anonymous namespace
+
+char RISCVInitUndef::ID = 0;
+
+INITIALIZE_PASS(RISCVInitUndef, DEBUG_TYPE, RISCV_INIT_UNDEF_NAME, false, false)
+
+bool RISCVInitUndef::isVectorRegClass(const Register &R) {
+  unsigned RegClassID = MRI->getRegClass(R)->getID();
+  switch (RegClassID) {
+  case RISCV::VRRegClassID:
+  case RISCV::VRM2RegClassID:
+  case RISCV::VRM4RegClassID:
+  case RISCV::VRM8RegClassID:
+    return true;
+  default:
+    return false;
+  }
+}
+
+void RISCVInitUndef::handleImplicitDef(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator &Inst) {
+  MachineInstr &MI = *Inst;
+
+  assert(MI.getOpcode() == TargetOpcode::IMPLICIT_DEF);
+  // All vector registers must be explicitly defined to prevent violating the
+  // vector register constraint.
+  unsigned Reg = MI.getOperand(0).getReg();
+  LLVM_DEBUG(
+      dbgs()
+      << "Emitting vmv.v.i vd, 0 with VLMAX for implicit vector register "
+      << Reg << '\n');
+
+  unsigned Opcode;
+  unsigned RegClassID = MRI->getRegClass(Reg)->getID();
+  switch (RegClassID) {
+  case RISCV::VRRegClassID:
+    Opcode = RISCV::PseudoVMV_V_I_M1;
+    break;
+  case RISCV::VRM2RegClassID:
+    Opcode = RISCV::PseudoVMV_V_I_M2;
+    break;
+  case RISCV::VRM4RegClassID:
+    Opcode = RISCV::PseudoVMV_V_I_M4;
+    break;
+  case RISCV::VRM8RegClassID:
+    Opcode = RISCV::PseudoVMV_V_I_M8;
+    break;
+  default:
+    llvm_unreachable("Unexpected register class?");
+  }
+
+  BuildMI(MBB, Inst, MI.getDebugLoc(), TII->get(Opcode), Reg)
+      .addImm(0)
+      .addImm(/* AVL=VLMAX */ -1)
+      .addImm(/* SEW */ 4);
+
+  Inst = MBB.erase(Inst); // Remove the IMPLICIT_DEF instruction.
+
+  // We want to leave Inst pointing to the previous instruction, but what if we
+  // just erased the first instruction?
+  if (Inst == MBB.begin()) {
+    LLVM_DEBUG(dbgs() << "Inserting dummy KILL\n");
+    Inst = BuildMI(MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
+  } else
+    --Inst;
+}
+
+bool RISCVInitUndef::processBasicBlock(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+    MachineInstr &MI = *I;
+    if (MI.isImplicitDef()) {
+      auto DstReg = MI.getOperand(0).getReg();
+      if (isVectorRegClass(DstReg)) {
+        handleImplicitDef(MBB, I);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+bool RISCVInitUndef::runOnMachineFunction(MachineFunction &MF) {
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+  if (!ST.hasVInstructions())
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  bool Changed = false;
+  for (MachineBasicBlock &BB : MF)
+    Changed |= processBasicBlock(MF, BB);
+
+  return Changed;
+}
+
+FunctionPass *llvm::createRISCVInitUndefPass() { return new RISCVInitUndef(); }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -53,6 +53,7 @@
   initializeRISCVSExtWRemovalPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
+  initializeRISCVInitUndefPass(*PR);
 }
 
 static StringRef computeDataLayout(const Triple &TT) {
@@ -253,6 +254,9 @@
 void RISCVPassConfig::addPreRegAlloc() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVMergeBaseOffsetOptPass());
+
+  if (getOptimizeRegAlloc())
+    addPass(createRISCVInitUndefPass());
   addPass(createRISCVInsertVSETVLIPass());
 }
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -20,68 +20,68 @@
 ; CHECK-NEXT: .cfi_offset ra, -8
 ; CHECK-NEXT: .cfi_offset s0, -16
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
 ; CHECK-NEXT: sub sp, sp, a0
 ; CHECK-NEXT: li a0, 55
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vloxseg2ei32.v v8, (a0), v8
+; CHECK-NEXT: vloxseg2ei32.v v16, (a0), v8
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs4r.v v12, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; CHECK-NEXT: vs4r.v v20, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT: vmclr.m v0
 ; CHECK-NEXT: li s0, 36
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmv.v.i v12, 0
 ; CHECK-NEXT: vsetvli zero, s0, e16, m4, tu, mu
-; CHECK-NEXT: vfwadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwadd.vv v16, v8, v12, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT: call func@plt
 ; CHECK-NEXT: li
a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vrgather.vv v4, v8, v8, v0.t +; CHECK-NEXT: vrgather.vv v16, v8, v12, v0.t ; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, mu +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.wv v16, v8, v24 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.wv v8, v24, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v20, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vssubu.vv v4, v4, v8, v0.t +; CHECK-NEXT: vssubu.vv v16, v16, v20, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, s0, e32, m8, tu, mu -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfdiv.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfdiv.vv v8, v24, v16, v0.t ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -100,45 +100,62 @@ ; SUBREGLIVENESS-NEXT: slli a0, a0, 4 ; SUBREGLIVENESS-NEXT: sub sp, sp, a0 ; SUBREGLIVENESS-NEXT: li a0, 55 +; SUBREGLIVENESS-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v8, 0 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; SUBREGLIVENESS-NEXT: vloxseg2ei32.v v8, (a0), v8 +; SUBREGLIVENESS-NEXT: vloxseg2ei32.v v16, (a0), v8 ; SUBREGLIVENESS-NEXT: csrr a0, vlenb ; SUBREGLIVENESS-NEXT: slli a0, a0, 3 ; SUBREGLIVENESS-NEXT: add a0, sp, a0 ; SUBREGLIVENESS-NEXT: addi a0, a0, 16 ; SUBREGLIVENESS-NEXT: csrr a1, vlenb ; SUBREGLIVENESS-NEXT: slli a1, a1, 2 -; SUBREGLIVENESS-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill +; SUBREGLIVENESS-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill ; SUBREGLIVENESS-NEXT: add a0, a0, a1 -; SUBREGLIVENESS-NEXT: vs4r.v v12, (a0) # Unknown-size Folded Spill -; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; SUBREGLIVENESS-NEXT: vs4r.v v20, (a0) # Unknown-size Folded Spill +; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; SUBREGLIVENESS-NEXT: vmclr.m v0 ; SUBREGLIVENESS-NEXT: li s0, 36 +; SUBREGLIVENESS-NEXT: vmv.v.i v8, 0 
+; SUBREGLIVENESS-NEXT: vmv.v.i v12, 0 ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, tu, mu -; SUBREGLIVENESS-NEXT: vfwadd.vv v8, v8, v8, v0.t +; SUBREGLIVENESS-NEXT: vfwadd.vv v16, v8, v12, v0.t ; SUBREGLIVENESS-NEXT: addi a0, sp, 16 -; SUBREGLIVENESS-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; SUBREGLIVENESS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; SUBREGLIVENESS-NEXT: call func@plt ; SUBREGLIVENESS-NEXT: li a0, 32 +; SUBREGLIVENESS-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v0, 0 +; SUBREGLIVENESS-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v8, 0 +; SUBREGLIVENESS-NEXT: vmv.v.i v12, 0 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; SUBREGLIVENESS-NEXT: vrgather.vv v16, v8, v8, v0.t +; SUBREGLIVENESS-NEXT: vrgather.vv v16, v8, v12, v0.t ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, mu +; SUBREGLIVENESS-NEXT: addi a1, sp, 16 +; SUBREGLIVENESS-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload ; SUBREGLIVENESS-NEXT: csrr a1, vlenb ; SUBREGLIVENESS-NEXT: slli a1, a1, 3 ; SUBREGLIVENESS-NEXT: add a1, sp, a1 ; SUBREGLIVENESS-NEXT: addi a1, a1, 16 ; SUBREGLIVENESS-NEXT: csrr a2, vlenb ; SUBREGLIVENESS-NEXT: slli a2, a2, 2 -; SUBREGLIVENESS-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; SUBREGLIVENESS-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; SUBREGLIVENESS-NEXT: add a1, a1, a2 -; SUBREGLIVENESS-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; SUBREGLIVENESS-NEXT: addi a1, sp, 16 -; SUBREGLIVENESS-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload -; SUBREGLIVENESS-NEXT: vfwsub.wv v8, v24, v20 +; SUBREGLIVENESS-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; SUBREGLIVENESS-NEXT: vfwsub.wv v8, v24, v4 +; SUBREGLIVENESS-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v0, 0 +; SUBREGLIVENESS-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v20, 0 ; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; SUBREGLIVENESS-NEXT: vssubu.vv v16, v16, v8, v0.t +; SUBREGLIVENESS-NEXT: vssubu.vv v16, v16, v20, v0.t +; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v0, 0 +; SUBREGLIVENESS-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; SUBREGLIVENESS-NEXT: vmv.v.i v16, 0 ; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e32, m8, tu, mu -; SUBREGLIVENESS-NEXT: vfdiv.vv v8, v24, v8, v0.t +; SUBREGLIVENESS-NEXT: vfdiv.vv v8, v24, v16, v0.t ; SUBREGLIVENESS-NEXT: vse32.v v8, (a0) ; SUBREGLIVENESS-NEXT: csrr a0, vlenb ; SUBREGLIVENESS-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/allone-masked-to-unmasked.ll b/llvm/test/CodeGen/RISCV/rvv/allone-masked-to-unmasked.ll --- a/llvm/test/CodeGen/RISCV/rvv/allone-masked-to-unmasked.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allone-masked-to-unmasked.ll @@ -35,8 +35,11 @@ define @test1( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %allone = call @llvm.riscv.vmset.nxv1i1( diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-splats.ll b/llvm/test/CodeGen/RISCV/rvv/combine-splats.ll --- a/llvm/test/CodeGen/RISCV/rvv/combine-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-splats.ll @@ -60,10 +60,16 @@ ; CHECK-LABEL: 
combine_vec_shl_shl: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vsll.vv v8, v8, v10 ; CHECK-NEXT: vsll.vv v8, v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -190,8 +190,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv1i32.nxv16i32( %vec, i64 1) ret %c @@ -202,6 +205,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu ; CHECK-NEXT: vslidedown.vx v8, v9, a0 ; CHECK-NEXT: ret @@ -214,6 +219,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu ; CHECK-NEXT: vslidedown.vx v8, v15, a0 ; CHECK-NEXT: ret @@ -251,8 +258,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv2i8.nxv32i8( %vec, i64 2) ret %c @@ -263,8 +273,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv2i8.nxv32i8( %vec, i64 4) ret %c @@ -277,8 +290,11 @@ ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: li a1, 6 ; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv2i8.nxv32i8( %vec, i64 6) ret %c @@ -300,6 +316,8 @@ ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: li a1, 6 ; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vx v8, v10, a0 ; CHECK-NEXT: ret @@ -313,8 +331,11 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, 
ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv1i8.nxv8i8( %vec, i64 7) ret %c @@ -327,8 +348,11 @@ ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv1i8.nxv4i8( %vec, i64 3) ret %c @@ -348,7 +372,9 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv2f16.nxv16f16( %vec, i64 2) ret %c @@ -376,8 +402,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv8i1( %mask, i64 8) ret %c @@ -399,10 +428,12 @@ ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv2i1( %mask, i64 2) ret %c @@ -424,10 +455,12 @@ ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv4i1( %x, i64 4) ret %c @@ -446,8 +479,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %c = call @llvm.vector.extract.nxv16i1( %x, i64 16) ret %c @@ -470,8 +506,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vslidedown.vx v11, v10, a0 -; CHECK-NEXT: vslidedown.vx v8, v9, a0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vslidedown.vx v12, v9, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv1r.v v8, v12 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v9, v11, 0 ; CHECK-NEXT: add a1, a0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv32.ll 
b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv32.ll @@ -15,9 +15,11 @@ define half @extractelt_nxv1f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -26,9 +28,11 @@ define half @extractelt_nxv1f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -47,9 +51,11 @@ define half @extractelt_nxv2f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -58,9 +64,11 @@ define half @extractelt_nxv2f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -79,9 +87,11 @@ define half @extractelt_nxv4f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -90,9 +100,11 @@ define half @extractelt_nxv4f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -111,9 +123,11 @@ define half @extractelt_nxv8f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -122,9 +136,11 @@ define half @extractelt_nxv8f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i 
v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -143,9 +159,11 @@ define half @extractelt_nxv16f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv16f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -154,9 +172,11 @@ define half @extractelt_nxv16f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv16f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -175,9 +195,11 @@ define half @extractelt_nxv32f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv32f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -186,9 +208,11 @@ define half @extractelt_nxv32f16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv32f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -207,9 +231,11 @@ define float @extractelt_nxv1f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -218,9 +244,11 @@ define float @extractelt_nxv1f32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -239,9 +267,11 @@ define float @extractelt_nxv2f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -250,9 +280,11 @@ define float @extractelt_nxv2f32_idx( %v, i32 %idx) { 
; CHECK-LABEL: extractelt_nxv2f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -271,9 +303,11 @@ define float @extractelt_nxv4f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -282,9 +316,11 @@ define float @extractelt_nxv4f32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -303,9 +339,11 @@ define float @extractelt_nxv8f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -314,9 +352,11 @@ define float @extractelt_nxv8f32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -335,9 +375,11 @@ define float @extractelt_nxv16f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv16f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -346,9 +388,11 @@ define float @extractelt_nxv16f32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv16f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -367,9 +411,11 @@ define double @extractelt_nxv1f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 
; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -378,9 +424,11 @@ define double @extractelt_nxv1f64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -399,9 +447,11 @@ define double @extractelt_nxv2f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -410,9 +460,11 @@ define double @extractelt_nxv2f64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -431,9 +483,11 @@ define double @extractelt_nxv4f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -442,9 +496,11 @@ define double @extractelt_nxv4f64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -463,9 +519,11 @@ define double @extractelt_nxv8f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -474,9 +532,11 @@ define double @extractelt_nxv8f64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -489,9 +549,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI45_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfadd.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: 
vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -507,9 +569,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI46_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfrsub.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -525,9 +589,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI47_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmul.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll @@ -15,9 +15,11 @@ define half @extractelt_nxv1f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -26,9 +28,11 @@ define half @extractelt_nxv1f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -47,9 +51,11 @@ define half @extractelt_nxv2f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -58,9 +64,11 @@ define half @extractelt_nxv2f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -79,9 +87,11 @@ define half @extractelt_nxv4f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f16_imm: ; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -90,9 +100,11 @@ define half @extractelt_nxv4f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -111,9 +123,11 @@ define half @extractelt_nxv8f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -122,9 +136,11 @@ define half @extractelt_nxv8f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -143,9 +159,11 @@ define half @extractelt_nxv16f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv16f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -154,9 +172,11 @@ define half @extractelt_nxv16f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv16f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret half %r @@ -175,9 +195,11 @@ define half @extractelt_nxv32f16_imm( %v) { ; CHECK-LABEL: extractelt_nxv32f16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret half %r @@ -186,9 +208,11 @@ define half @extractelt_nxv32f16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv32f16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = 
extractelement %v, i32 %idx ret half %r @@ -207,9 +231,11 @@ define float @extractelt_nxv1f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -218,9 +244,11 @@ define float @extractelt_nxv1f32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -239,9 +267,11 @@ define float @extractelt_nxv2f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -250,9 +280,11 @@ define float @extractelt_nxv2f32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -271,9 +303,11 @@ define float @extractelt_nxv4f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -282,9 +316,11 @@ define float @extractelt_nxv4f32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -303,9 +339,11 @@ define float @extractelt_nxv8f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -314,9 +352,11 @@ define float @extractelt_nxv8f32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; 
CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -335,9 +375,11 @@ define float @extractelt_nxv16f32_imm( %v) { ; CHECK-LABEL: extractelt_nxv16f32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret float %r @@ -346,9 +388,11 @@ define float @extractelt_nxv16f32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv16f32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret float %r @@ -367,9 +411,11 @@ define double @extractelt_nxv1f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv1f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -378,9 +424,11 @@ define double @extractelt_nxv1f64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -399,9 +447,11 @@ define double @extractelt_nxv2f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv2f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -410,9 +460,11 @@ define double @extractelt_nxv2f64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -431,9 +483,11 @@ define double @extractelt_nxv4f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv4f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -442,9 +496,11 @@ define double @extractelt_nxv4f64_idx( %v, i32 signext %idx) { ; 
CHECK-LABEL: extractelt_nxv4f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -463,9 +519,11 @@ define double @extractelt_nxv8f64_imm( %v) { ; CHECK-LABEL: extractelt_nxv8f64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret double %r @@ -474,9 +532,11 @@ define double @extractelt_nxv8f64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8f64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret double %r @@ -486,9 +546,11 @@ ; CHECK-LABEL: store_extractelt_nxv8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v16, v8, 1 +; CHECK-NEXT: vse64.v v16, (a1) ; CHECK-NEXT: ret %a = load , * %x %b = extractelement %a, i64 1 @@ -518,9 +580,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI47_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfadd.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -536,9 +600,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI48_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfrsub.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -554,9 +620,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI49_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfmul.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %head = insertelement poison, float 3.0, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll @@ -10,9 +10,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -28,9 +30,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -46,9 +50,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -64,9 +70,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -82,9 +90,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -100,9 +110,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer @@ -118,9 +130,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %a = load , * %x %b = icmp eq %a, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll @@ -15,9 +15,11 @@ define signext i8 @extractelt_nxv1i8_imm( %v) { ; CHECK-LABEL: 
extractelt_nxv1i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -26,9 +28,11 @@ define signext i8 @extractelt_nxv1i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -47,9 +51,11 @@ define signext i8 @extractelt_nxv2i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -58,9 +64,11 @@ define signext i8 @extractelt_nxv2i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -79,9 +87,11 @@ define signext i8 @extractelt_nxv4i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -90,9 +100,11 @@ define signext i8 @extractelt_nxv4i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -111,9 +123,11 @@ define signext i8 @extractelt_nxv8i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -122,9 +136,11 @@ define signext i8 @extractelt_nxv8i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -143,9 
+159,11 @@ define signext i8 @extractelt_nxv16i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -154,9 +172,11 @@ define signext i8 @extractelt_nxv16i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv16i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -175,9 +195,11 @@ define signext i8 @extractelt_nxv32i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv32i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -186,9 +208,11 @@ define signext i8 @extractelt_nxv32i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv32i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -207,9 +231,11 @@ define signext i8 @extractelt_nxv64i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv64i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -218,9 +244,11 @@ define signext i8 @extractelt_nxv64i8_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv64i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -239,9 +267,11 @@ define signext i16 @extractelt_nxv1i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -250,9 +280,11 @@ define signext i16 @extractelt_nxv1i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: 
vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -271,9 +303,11 @@ define signext i16 @extractelt_nxv2i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -282,9 +316,11 @@ define signext i16 @extractelt_nxv2i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -303,9 +339,11 @@ define signext i16 @extractelt_nxv4i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -314,9 +352,11 @@ define signext i16 @extractelt_nxv4i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -335,9 +375,11 @@ define signext i16 @extractelt_nxv8i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -346,9 +388,11 @@ define signext i16 @extractelt_nxv8i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -367,9 +411,11 @@ define signext i16 @extractelt_nxv16i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -378,9 +424,11 @@ define signext i16 @extractelt_nxv16i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv16i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; 
CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -399,9 +447,11 @@ define signext i16 @extractelt_nxv32i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv32i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -410,9 +460,11 @@ define signext i16 @extractelt_nxv32i16_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv32i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -431,9 +483,11 @@ define i32 @extractelt_nxv1i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -442,9 +496,11 @@ define i32 @extractelt_nxv1i32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -463,9 +519,11 @@ define i32 @extractelt_nxv2i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -474,9 +532,11 @@ define i32 @extractelt_nxv2i32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -495,9 +555,11 @@ define i32 @extractelt_nxv4i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -506,9 +568,11 @@ define i32 @extractelt_nxv4i32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4i32_idx: ; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -527,9 +591,11 @@ define i32 @extractelt_nxv8i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -538,9 +604,11 @@ define i32 @extractelt_nxv8i32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -559,9 +627,11 @@ define i32 @extractelt_nxv16i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -570,9 +640,11 @@ define i32 @extractelt_nxv16i32_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv16i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -594,11 +666,13 @@ define i64 @extractelt_nxv1i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v9, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 @@ -608,11 +682,13 @@ define i64 @extractelt_nxv1i64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv1i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v9, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx @@ -635,11 +711,13 @@ define i64 @extractelt_nxv2i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: 
vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v10, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 @@ -649,11 +727,13 @@ define i64 @extractelt_nxv2i64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv2i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v10, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx @@ -676,11 +756,13 @@ define i64 @extractelt_nxv4i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v12, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 @@ -690,11 +772,13 @@ define i64 @extractelt_nxv4i64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv4i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v12, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx @@ -717,11 +801,13 @@ define i64 @extractelt_nxv8i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v16, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 @@ -731,11 +817,13 @@ define i64 @extractelt_nxv8i64_idx( %v, i32 %idx) { ; CHECK-LABEL: extractelt_nxv8i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vsrl.vx v8, v16, a1 ; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx @@ -747,9 +835,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = 
insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -763,9 +853,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vrsub.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -780,9 +872,11 @@ ; CHECK-NEXT: li a0, 3 ; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu ; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll @@ -15,9 +15,11 @@ define signext i8 @extractelt_nxv1i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -26,9 +28,11 @@ define signext i8 @extractelt_nxv1i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -47,9 +51,11 @@ define signext i8 @extractelt_nxv2i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -58,9 +64,11 @@ define signext i8 @extractelt_nxv2i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -79,9 +87,11 @@ define signext i8 @extractelt_nxv4i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; 
CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -90,9 +100,11 @@ define signext i8 @extractelt_nxv4i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -111,9 +123,11 @@ define signext i8 @extractelt_nxv8i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -122,9 +136,11 @@ define signext i8 @extractelt_nxv8i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -143,9 +159,11 @@ define signext i8 @extractelt_nxv16i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -154,9 +172,11 @@ define signext i8 @extractelt_nxv16i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv16i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -175,9 +195,11 @@ define signext i8 @extractelt_nxv32i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv32i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -186,9 +208,11 @@ define signext i8 @extractelt_nxv32i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv32i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -207,9 +231,11 @@ define signext i8 @extractelt_nxv64i8_imm( %v) { ; CHECK-LABEL: extractelt_nxv64i8_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; 
CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i8 %r @@ -218,9 +244,11 @@ define signext i8 @extractelt_nxv64i8_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv64i8_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i8 %r @@ -239,9 +267,11 @@ define signext i16 @extractelt_nxv1i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -250,9 +280,11 @@ define signext i16 @extractelt_nxv1i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -271,9 +303,11 @@ define signext i16 @extractelt_nxv2i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -282,9 +316,11 @@ define signext i16 @extractelt_nxv2i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -303,9 +339,11 @@ define signext i16 @extractelt_nxv4i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -314,9 +352,11 @@ define signext i16 @extractelt_nxv4i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -335,9 +375,11 @@ define signext i16 @extractelt_nxv8i16_imm( %v) { ; CHECK-LABEL: 
extractelt_nxv8i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -346,9 +388,11 @@ define signext i16 @extractelt_nxv8i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -367,9 +411,11 @@ define signext i16 @extractelt_nxv16i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -378,9 +424,11 @@ define signext i16 @extractelt_nxv16i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv16i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -399,9 +447,11 @@ define signext i16 @extractelt_nxv32i16_imm( %v) { ; CHECK-LABEL: extractelt_nxv32i16_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i16 %r @@ -410,9 +460,11 @@ define signext i16 @extractelt_nxv32i16_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv32i16_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i16 %r @@ -431,9 +483,11 @@ define signext i32 @extractelt_nxv1i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -442,9 +496,11 @@ define signext i32 @extractelt_nxv1i32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; 
CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -463,9 +519,11 @@ define signext i32 @extractelt_nxv2i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -474,9 +532,11 @@ define signext i32 @extractelt_nxv2i32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -495,9 +555,11 @@ define signext i32 @extractelt_nxv4i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -506,9 +568,11 @@ define signext i32 @extractelt_nxv4i32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -527,9 +591,11 @@ define signext i32 @extractelt_nxv8i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -538,9 +604,11 @@ define signext i32 @extractelt_nxv8i32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv8i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -559,9 +627,11 @@ define signext i32 @extractelt_nxv16i32_imm( %v) { ; CHECK-LABEL: extractelt_nxv16i32_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i32 %r @@ -570,9 +640,11 @@ define signext i32 @extractelt_nxv16i32_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv16i32_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i 
v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i32 %r @@ -591,9 +663,11 @@ define i64 @extractelt_nxv1i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv1i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i64 %r @@ -602,9 +676,11 @@ define i64 @extractelt_nxv1i64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv1i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i64 %r @@ -623,9 +699,11 @@ define i64 @extractelt_nxv2i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv2i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i64 %r @@ -634,9 +712,11 @@ define i64 @extractelt_nxv2i64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv2i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i64 %r @@ -655,9 +735,11 @@ define i64 @extractelt_nxv4i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv4i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i64 %r @@ -666,9 +748,11 @@ define i64 @extractelt_nxv4i64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: extractelt_nxv4i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v12 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i64 %r @@ -687,9 +771,11 @@ define i64 @extractelt_nxv8i64_imm( %v) { ; CHECK-LABEL: extractelt_nxv8i64_imm: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 2 ret i64 %r @@ -698,9 +784,11 @@ define i64 @extractelt_nxv8i64_idx( %v, i32 signext %idx) { ; CHECK-LABEL: 
extractelt_nxv8i64_idx: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: ret %r = extractelement %v, i32 %idx ret i64 %r @@ -711,9 +799,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -727,9 +817,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vrsub.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -744,9 +836,11 @@ ; CHECK-NEXT: li a0, 3 ; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu ; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fpext-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fpext-vp.ll @@ -96,13 +96,17 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v1, 2 ; CHECK-NEXT: bltu a0, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fptrunc-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-fptrunc-vp.ll @@ -94,42 +94,36 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vfptrunc_v32f32_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: 
vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vfncvt.f.f.w v28, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vfncvt.f.f.w v8, v16, v0.t +; CHECK-NEXT: vmv4r.v v16, v28 ; CHECK-NEXT: bltu a0, a1, .LBB7_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t +; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslideup.vi v8, v16, 16 ; CHECK-NEXT: ret %v = call <32 x float> @llvm.vp.fptrunc.v32f64.v32f32(<32 x double> %a, <32 x i1> %m, i32 %vl) ret <32 x float> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -60,7 +60,9 @@ ; V-NEXT: li a2, 0 ; V-NEXT: lui a3, 983765 ; V-NEXT: addiw a3, a3, 873 -; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; V-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; V-NEXT: vmv.v.i v0, 0 +; V-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; V-NEXT: vmv.s.x v0, a3 ; V-NEXT: li a3, 32 ; V-NEXT: li a4, 5 @@ -85,7 +87,9 @@ ; ZVE32F-NEXT: li a2, 0 ; ZVE32F-NEXT: lui a3, 983765 ; ZVE32F-NEXT: addiw a3, a3, 873 -; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a3 ; ZVE32F-NEXT: li a3, 32 ; ZVE32F-NEXT: li a4, 5 @@ -274,7 +278,9 @@ ; V-NEXT: li a3, 32 ; V-NEXT: lui a4, 983765 ; V-NEXT: addiw a4, a4, 873 -; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; V-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; V-NEXT: vmv.v.i v0, 0 +; V-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; V-NEXT: vmv.s.x v0, a4 ; V-NEXT: li a4, 5 ; V-NEXT: li a5, 1024 @@ -299,7 +305,9 @@ ; ZVE32F-NEXT: li a3, 32 ; ZVE32F-NEXT: lui a4, 983765 ; ZVE32F-NEXT: addiw a4, a4, 873 -; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; ZVE32F-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a4 ; ZVE32F-NEXT: li a4, 5 ; ZVE32F-NEXT: li a5, 1024 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp-mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp-mask.ll @@ -8,9 +8,12 @@ ; CHECK-LABEL: vtrunc_nxv2i1_nxv2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vand.vi v8, v8, 1, v0.t -; CHECK-NEXT: 
vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vand.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call <2 x i1> @llvm.vp.trunc.nxv2i16.nxv2i1(<2 x i16> %a, <2 x i1> %m, i32 %vl) ret <2 x i1> %v @@ -33,9 +36,12 @@ ; CHECK-LABEL: vtrunc_nxv2i1_nxv2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vand.vi v8, v8, 1, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vand.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call <2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<2 x i32> %a, <2 x i1> %m, i32 %vl) ret <2 x i1> %v @@ -58,9 +64,12 @@ ; CHECK-LABEL: vtrunc_nxv2i1_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vand.vi v8, v8, 1, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vand.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call <2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<2 x i64> %a, <2 x i1> %m, i32 %vl) ret <2 x i1> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-trunc-vp.ll @@ -53,42 +53,36 @@ define <128 x i7> @vtrunc_nxv128i7_nxv128i16(<128 x i16> %a, <128 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vtrunc_nxv128i7_nxv128i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; CHECK-NEXT: addi a2, a0, -64 -; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vslidedown.vi v0, v24, 8 ; CHECK-NEXT: bltu a0, a2, .LBB4_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v28, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vncvt.x.x.w v8, v16, v0.t +; CHECK-NEXT: vmv4r.v v16, v28 ; CHECK-NEXT: bltu a0, a1, .LBB4_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vncvt.x.x.w v16, v24, v0.t +; CHECK-NEXT: vncvt.x.x.w v24, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu -; CHECK-NEXT: vslideup.vx v16, v8, a1 
-; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslideup.vx v8, v16, a1 ; CHECK-NEXT: ret %v = call <128 x i7> @llvm.vp.trunc.nxv128i7.nxv128i16(<128 x i16> %a, <128 x i1> %m, i32 %vl) ret <128 x i7> %v @@ -237,259 +231,6 @@ declare <128 x i32> @llvm.vp.trunc.nxv128i64.nxv128i32(<128 x i64>, <128 x i1>, i32) define <128 x i32> @vtrunc_nxv128i32_nxv128i64(<128 x i64> %a, <128 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: vtrunc_nxv128i32_nxv128i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 56 -; CHECK-NEXT: mul a2, a2, a3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a2, a2, a3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; CHECK-NEXT: addi a3, a7, -64 -; CHECK-NEXT: vslidedown.vi v2, v0, 8 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bltu a7, a3, .LBB16_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: .LBB16_2: -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v28, v2, 4 -; CHECK-NEXT: addi a6, a4, -32 -; CHECK-NEXT: addi a3, a1, 640 -; CHECK-NEXT: mv a5, a2 -; CHECK-NEXT: bltu a4, a6, .LBB16_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv a5, a6 -; CHECK-NEXT: .LBB16_4: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v28, 2 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v16, (a3) -; CHECK-NEXT: addi t0, a5, -16 -; CHECK-NEXT: addi a6, a1, 512 -; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: bltu a5, t0, .LBB16_6 -; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: mv a3, t0 -; CHECK-NEXT: .LBB16_6: -; CHECK-NEXT: vle64.v v8, (a6) -; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, mu -; CHECK-NEXT: li a3, 16 -; CHECK-NEXT: vncvt.x.x.w v24, v16, v0.t -; CHECK-NEXT: csrr a6, vlenb -; CHECK-NEXT: slli a6, a6, 4 -; CHECK-NEXT: add a6, sp, a6 -; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a5, a3, .LBB16_8 -; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: li a5, 16 -; CHECK-NEXT: .LBB16_8: -; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, mu -; CHECK-NEXT: li a5, 64 -; CHECK-NEXT: vmv1r.v v0, v28 -; CHECK-NEXT: vncvt.x.x.w v16, v8, v0.t -; CHECK-NEXT: csrr a6, vlenb -; CHECK-NEXT: li t0, 48 -; CHECK-NEXT: mul a6, a6, t0 -; CHECK-NEXT: add a6, sp, a6 -; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a7, a5, .LBB16_10 -; CHECK-NEXT: # %bb.9: -; CHECK-NEXT: li a7, 64 -; CHECK-NEXT: .LBB16_10: -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v28, v1, 4 -; CHECK-NEXT: addi t0, a7, -32 -; CHECK-NEXT: addi a5, a1, 128 -; CHECK-NEXT: mv a6, a2 -; CHECK-NEXT: bltu a7, t0, .LBB16_12 -; CHECK-NEXT: # %bb.11: -; CHECK-NEXT: mv a6, t0 -; CHECK-NEXT: .LBB16_12: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v28, 2 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v16, (a5) -; CHECK-NEXT: addi a5, a6, -16 -; CHECK-NEXT: mv t0, 
a2 -; CHECK-NEXT: bltu a6, a5, .LBB16_14 -; CHECK-NEXT: # %bb.13: -; CHECK-NEXT: mv t0, a5 -; CHECK-NEXT: .LBB16_14: -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: li a5, 32 -; CHECK-NEXT: vsetvli zero, t0, e32, m4, ta, mu -; CHECK-NEXT: vncvt.x.x.w v24, v16, v0.t -; CHECK-NEXT: csrr t0, vlenb -; CHECK-NEXT: slli t0, t0, 3 -; CHECK-NEXT: add t0, sp, t0 -; CHECK-NEXT: addi t0, t0, 16 -; CHECK-NEXT: vs8r.v v24, (t0) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a6, a3, .LBB16_16 -; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: li a6, 16 -; CHECK-NEXT: .LBB16_16: -; CHECK-NEXT: addi t0, a1, 384 -; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, mu -; CHECK-NEXT: vmv1r.v v0, v28 -; CHECK-NEXT: vncvt.x.x.w v16, v8, v0.t -; CHECK-NEXT: csrr a6, vlenb -; CHECK-NEXT: li t1, 40 -; CHECK-NEXT: mul a6, a6, t1 -; CHECK-NEXT: add a6, sp, a6 -; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a4, a5, .LBB16_18 -; CHECK-NEXT: # %bb.17: -; CHECK-NEXT: li a4, 32 -; CHECK-NEXT: .LBB16_18: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v2, 2 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v24, (t0) -; CHECK-NEXT: addi t0, a4, -16 -; CHECK-NEXT: addi a6, a1, 256 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: bltu a4, t0, .LBB16_20 -; CHECK-NEXT: # %bb.19: -; CHECK-NEXT: mv a1, t0 -; CHECK-NEXT: .LBB16_20: -; CHECK-NEXT: vle64.v v8, (a6) -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; CHECK-NEXT: vncvt.x.x.w v16, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a4, a3, .LBB16_22 -; CHECK-NEXT: # %bb.21: -; CHECK-NEXT: li a4, 16 -; CHECK-NEXT: .LBB16_22: -; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, mu -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vncvt.x.x.w v24, v8, v0.t -; CHECK-NEXT: bltu a7, a5, .LBB16_24 -; CHECK-NEXT: # %bb.23: -; CHECK-NEXT: li a7, 32 -; CHECK-NEXT: .LBB16_24: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: addi a1, a7, -16 -; CHECK-NEXT: vslidedown.vi v0, v1, 2 -; CHECK-NEXT: bltu a7, a1, .LBB16_26 -; CHECK-NEXT: # %bb.25: -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: .LBB16_26: -; CHECK-NEXT: vsetvli zero, a5, e32, m8, tu, mu -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a1, a1, a4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a1, a1, a4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a1, a1, a4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a1, a1, a4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, sp, 16 
-; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v24, v8, 16 -; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, mu -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vncvt.x.x.w v16, v8, v0.t -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a7, a3, .LBB16_28 -; CHECK-NEXT: # %bb.27: -; CHECK-NEXT: li a7, 16 -; CHECK-NEXT: .LBB16_28: -; CHECK-NEXT: vsetvli zero, a7, e32, m4, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vncvt.x.x.w v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, a5, e32, m8, tu, mu -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vse32.v v16, (a0) -; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vse32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: mul a2, a2, a3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vse32.v v8, (a1) -; CHECK-NEXT: addi a0, a0, 384 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 48 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 56 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret %v = call <128 x i32> @llvm.vp.trunc.nxv128i64.nxv128i32(<128 x i64> %a, <128 x i1> %m, i32 %vl) ret <128 x i32> %v } @@ -499,42 +240,36 @@ define <32 x i32> @vtrunc_nxv32i32_nxv32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vtrunc_nxv32i32_nxv32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB17_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB17_2: ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v28, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: vncvt.x.x.w v8, v16, v0.t +; CHECK-NEXT: vmv4r.v v16, v28 ; CHECK-NEXT: bltu a0, a1, .LBB17_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: .LBB17_4: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded 
Reload -; CHECK-NEXT: vncvt.x.x.w v16, v24, v0.t +; CHECK-NEXT: vncvt.x.x.w v24, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv4r.v v8, v24 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslideup.vi v8, v16, 16 ; CHECK-NEXT: ret %v = call <32 x i32> @llvm.vp.trunc.nxv32i64.nxv32i32(<32 x i64> %a, <32 x i1> %m, i32 %vl) ret <32 x i32> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll @@ -145,6 +145,8 @@ ; ; RV32ELEN32-LABEL: bitcast_v8i8_i64: ; RV32ELEN32: # %bb.0: +; RV32ELEN32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ELEN32-NEXT: vmv.v.i v9, 0 ; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ELEN32-NEXT: vslidedown.vi v9, v8, 1 ; RV32ELEN32-NEXT: vmv.x.s a1, v9 @@ -183,6 +185,8 @@ ; ; RV32ELEN32-LABEL: bitcast_v4i16_i64: ; RV32ELEN32: # %bb.0: +; RV32ELEN32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ELEN32-NEXT: vmv.v.i v9, 0 ; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ELEN32-NEXT: vslidedown.vi v9, v8, 1 ; RV32ELEN32-NEXT: vmv.x.s a1, v9 @@ -221,6 +225,8 @@ ; ; RV32ELEN32-LABEL: bitcast_v2i32_i64: ; RV32ELEN32: # %bb.0: +; RV32ELEN32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ELEN32-NEXT: vmv.v.i v9, 0 ; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ELEN32-NEXT: vslidedown.vi v9, v8, 1 ; RV32ELEN32-NEXT: vmv.x.s a1, v9 @@ -451,7 +457,9 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a) { ; RV32-LABEL: bitcast_i32_v2i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 ; RV32-NEXT: ret ; @@ -463,7 +471,9 @@ ; ; RV32ELEN32-LABEL: bitcast_i32_v2i16: ; RV32ELEN32: # %bb.0: -; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ELEN32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ELEN32-NEXT: vmv.v.i v8, 0 +; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32ELEN32-NEXT: vmv.s.x v8, a0 ; RV32ELEN32-NEXT: ret ; @@ -479,7 +489,9 @@ define <1 x i32> @bitcast_i32_v1i32(i32 %a) { ; RV32-LABEL: bitcast_i32_v1i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 ; RV32-NEXT: ret ; @@ -491,7 +503,9 @@ ; ; RV32ELEN32-LABEL: bitcast_i32_v1i32: ; RV32ELEN32: # %bb.0: -; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ELEN32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ELEN32-NEXT: vmv.v.i v8, 0 +; RV32ELEN32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32ELEN32-NEXT: vmv.s.x v8, a0 ; RV32ELEN32-NEXT: ret ; @@ -511,13 +525,17 @@ ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vslide1up.vx v9, v8, a1 ; RV32-NEXT: vslide1up.vx v10, v9, a0 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vslideup.vi v8, v10, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: bitcast_i64_v4i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, 
zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: ret ; @@ -550,13 +568,17 @@ ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vslide1up.vx v9, v8, a1 ; RV32-NEXT: vslide1up.vx v10, v9, a0 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vslideup.vi v8, v10, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: bitcast_i64_v2i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: ret ; @@ -589,13 +611,17 @@ ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vslide1up.vx v9, v8, a1 ; RV32-NEXT: vslide1up.vx v10, v9, a0 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vslideup.vi v8, v10, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: bitcast_i64_v1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -179,6 +179,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v10, v10, a4 ; RV32-NEXT: li a5, 5 +; RV32-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v11, 0 @@ -725,6 +728,9 @@ ; LMULMAX2-RV32-NEXT: lui a4, 4080 ; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 ; LMULMAX2-RV32-NEXT: li a5, 85 +; LMULMAX2-RV32-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a5 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 @@ -869,6 +875,9 @@ ; LMULMAX1-RV32-NEXT: lui a5, 4080 ; LMULMAX1-RV32-NEXT: vand.vx v11, v9, a5 ; LMULMAX1-RV32-NEXT: li a6, 5 +; LMULMAX1-RV32-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a6 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -85,6 +85,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v10, v10, a4 ; RV32-NEXT: li a5, 5 +; RV32-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v11, 0 @@ -352,6 +355,9 @@ ; LMULMAX2-RV32-NEXT: lui a4, 4080 ; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 ; LMULMAX2-RV32-NEXT: li a5, 85 +; LMULMAX2-RV32-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x 
v0, a5 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 @@ -445,6 +451,9 @@ ; LMULMAX1-RV32-NEXT: lui a5, 4080 ; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5 ; LMULMAX1-RV32-NEXT: li a6, 5 +; LMULMAX1-RV32-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a6 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v12, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -142,6 +142,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -165,6 +167,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -212,6 +216,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -235,6 +241,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -258,18 +266,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf8 v11, v8 +; LMULMAX1-NEXT: vsext.vf8 v11, v13 ; LMULMAX1-NEXT: vsext.vf8 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -289,18 +303,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: 
vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf8 v11, v8 +; LMULMAX1-NEXT: vzext.vf8 v11, v13 ; LMULMAX1-NEXT: vzext.vf8 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -320,6 +340,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu @@ -343,6 +365,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu @@ -366,18 +390,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf4 v11, v8 +; LMULMAX1-NEXT: vsext.vf4 v11, v13 ; LMULMAX1-NEXT: vsext.vf4 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -397,18 +427,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf4 v11, v8 +; LMULMAX1-NEXT: vzext.vf4 v11, v13 ; 
LMULMAX1-NEXT: vzext.vf4 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -428,32 +464,46 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v16, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v12, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v9, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v10, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v14, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v13, v14 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v15, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf8 v14, v8 +; LMULMAX1-NEXT: vsext.vf8 v14, v15 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf8 v11, v15 +; LMULMAX1-NEXT: vsext.vf8 v11, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf8 v15, v8 ; LMULMAX1-NEXT: vsext.vf8 v8, v16 @@ -463,6 +513,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX4-NEXT: vle8.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -479,32 +531,46 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vle8.v v16, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v12, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v9, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v10, v11 +; 
LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v14, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v13, v14 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v15, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf8 v14, v8 +; LMULMAX1-NEXT: vzext.vf8 v14, v15 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf8 v11, v15 +; LMULMAX1-NEXT: vzext.vf8 v11, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf8 v15, v8 ; LMULMAX1-NEXT: vzext.vf8 v8, v16 @@ -514,6 +580,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX4-NEXT: vle8.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -647,6 +715,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -670,6 +740,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -705,6 +777,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -728,6 +802,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -751,18 +827,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; 
LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf4 v11, v8 +; LMULMAX1-NEXT: vsext.vf4 v11, v13 ; LMULMAX1-NEXT: vsext.vf4 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -782,18 +864,24 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf4 v11, v8 +; LMULMAX1-NEXT: vzext.vf4 v11, v13 ; LMULMAX1-NEXT: vzext.vf4 v8, v12 ; LMULMAX1-NEXT: ret ; @@ -833,14 +921,18 @@ define <16 x i32> @sextload_v16i16_v16i32(<16 x i16>* %x) { ; LMULMAX1-LABEL: sextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-NEXT: vle16.v v12, (a1) ; LMULMAX1-NEXT: vle16.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -863,14 +955,18 @@ define <16 x i32> @zextload_v16i16_v16i32(<16 x i16>* %x) { ; LMULMAX1-LABEL: zextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-NEXT: vle16.v v12, (a1) ; LMULMAX1-NEXT: vle16.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -893,30 +989,42 @@ define <16 x i64> @sextload_v16i16_v16i64(<16 x i16>* %x) { ; LMULMAX1-LABEL: sextload_v16i16_v16i64: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; 
LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-NEXT: vle16.v v16, (a1) ; LMULMAX1-NEXT: vle16.v v12, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v16, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v15, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v14, v15 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vsext.vf4 v11, v8 +; LMULMAX1-NEXT: vsext.vf4 v11, v13 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v13, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -929,6 +1037,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX4-NEXT: vle16.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -943,30 +1053,42 @@ define <16 x i64> @zextload_v16i16_v16i64(<16 x i16>* %x) { ; LMULMAX1-LABEL: zextload_v16i16_v16i64: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-NEXT: vle16.v v16, (a1) ; LMULMAX1-NEXT: vle16.v v12, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v16, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v15, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v14, v15 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: 
vslidedown.vi v13, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-NEXT: vzext.vf4 v11, v8 +; LMULMAX1-NEXT: vzext.vf4 v11, v13 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v13, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -979,6 +1101,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX4-NEXT: vle16.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -1071,6 +1195,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1094,6 +1220,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1166,14 +1294,18 @@ define <8 x i64> @sextload_v8i32_v8i64(<8 x i32>* %x) { ; LMULMAX1-LABEL: sextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vle32.v v12, (a1) ; LMULMAX1-NEXT: vle32.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1196,14 +1328,18 @@ define <8 x i64> @zextload_v8i32_v8i64(<8 x i32>* %x) { ; LMULMAX1-LABEL: zextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vle32.v v12, (a1) ; LMULMAX1-NEXT: vle32.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1301,21 +1437,29 @@ ; LMULMAX1-NEXT: vle32.v v16, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 ; LMULMAX1-NEXT: vle32.v v14, (a1) 
+; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v12, (a1) ; LMULMAX1-NEXT: vle32.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf2 v11, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf2 v13, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1330,6 +1474,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; LMULMAX4-NEXT: vle32.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -1349,21 +1495,29 @@ ; LMULMAX1-NEXT: vle32.v v16, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 ; LMULMAX1-NEXT: vle32.v v14, (a1) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v12, (a1) ; LMULMAX1-NEXT: vle32.v v10, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf2 v11, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf2 v13, v8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -1378,6 +1532,8 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; LMULMAX4-NEXT: vle32.v v16, (a0) +; LMULMAX4-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, mu ; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, mu @@ -1787,13 +1943,19 @@ ; LMULMAX4-NEXT: vncvt.x.x.w v16, v12 ; LMULMAX4-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; LMULMAX4-NEXT: vncvt.x.x.w v12, v16 -; LMULMAX4-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; LMULMAX4-NEXT: vncvt.x.x.w v14, v8 +; LMULMAX4-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX4-NEXT: vmv.v.i v14, 0 
+; LMULMAX4-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX4-NEXT: vncvt.x.x.w v16, v8 ; LMULMAX4-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; LMULMAX4-NEXT: vncvt.x.x.w v8, v14 +; LMULMAX4-NEXT: vncvt.x.x.w v8, v16 +; LMULMAX4-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX4-NEXT: vmv.v.i v10, 0 +; LMULMAX4-NEXT: vmv1r.v v14, v12 +; LMULMAX4-NEXT: vmv1r.v v10, v8 ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, tu, mu -; LMULMAX4-NEXT: vslideup.vi v8, v12, 8 -; LMULMAX4-NEXT: vse16.v v8, (a0) +; LMULMAX4-NEXT: vslideup.vi v10, v14, 8 +; LMULMAX4-NEXT: vse16.v v10, (a0) ; LMULMAX4-NEXT: ret %y = trunc <16 x i64> %x to <16 x i16> store <16 x i16> %y, <16 x i16>* %z @@ -1836,10 +1998,17 @@ ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX4-NEXT: vncvt.x.x.w v16, v12 -; LMULMAX4-NEXT: vncvt.x.x.w v12, v8 +; LMULMAX4-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; LMULMAX4-NEXT: vmv.v.i v12, 0 +; LMULMAX4-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX4-NEXT: vncvt.x.x.w v18, v8 +; LMULMAX4-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 +; LMULMAX4-NEXT: vmv2r.v v12, v16 +; LMULMAX4-NEXT: vmv2r.v v8, v18 ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, tu, mu -; LMULMAX4-NEXT: vslideup.vi v12, v16, 8 -; LMULMAX4-NEXT: vse32.v v12, (a0) +; LMULMAX4-NEXT: vslideup.vi v8, v12, 8 +; LMULMAX4-NEXT: vse32.v v8, (a0) ; LMULMAX4-NEXT: ret %y = trunc <16 x i64> %x to <16 x i32> store <16 x i32> %y, <16 x i32>* %z @@ -2136,7 +2305,9 @@ ; CHECK-NEXT: vfncvt.rod.f.f.w v24, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfncvt.f.f.w v8, v24 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu ; CHECK-NEXT: vfncvt.rod.f.f.w v12, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfncvt.f.f.w v10, v12 @@ -2152,8 +2323,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu ; CHECK-NEXT: vfncvt.f.f.w v24, v8 -; CHECK-NEXT: vfncvt.f.f.w v28, v16 -; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; CHECK-NEXT: vfncvt.f.f.w v12, v16 +; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: ret %y = fptrunc %x to store %y, * %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -13,8 +13,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <1 x i8>, <1 x i8>* %x %b = icmp eq <1 x i8> %a, zeroinitializer @@ -30,9 +33,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, <2 
x i8>* %x %b = icmp eq <2 x i8> %a, zeroinitializer @@ -48,9 +53,11 @@ ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %b = icmp eq <4 x i8> %a, zeroinitializer @@ -210,6 +217,8 @@ ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: vmseq.vi v12, v8, 0 ; RV32-NEXT: srli a0, a1, 5 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV32-NEXT: vslidedown.vx v8, v12, a0 ; RV32-NEXT: vmv.x.s a0, v8 @@ -236,6 +245,8 @@ ; RV32ZBS-NEXT: vle8.v v8, (a0) ; RV32ZBS-NEXT: vmseq.vi v12, v8, 0 ; RV32ZBS-NEXT: srli a0, a1, 5 +; RV32ZBS-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZBS-NEXT: vmv.v.i v8, 0 ; RV32ZBS-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV32ZBS-NEXT: vslidedown.vx v8, v12, a0 ; RV32ZBS-NEXT: vmv.x.s a0, v8 @@ -266,6 +277,8 @@ ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: vmseq.vi v16, v8, 0 ; RV32-NEXT: srli a0, a1, 5 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32-NEXT: vslidedown.vx v8, v16, a0 ; RV32-NEXT: vmv.x.s a0, v8 @@ -280,6 +293,8 @@ ; RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: vmseq.vi v16, v8, 0 ; RV64-NEXT: srli a0, a1, 6 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vslidedown.vx v8, v16, a0 ; RV64-NEXT: vmv.x.s a0, v8 @@ -294,6 +309,8 @@ ; RV32ZBS-NEXT: vle8.v v8, (a0) ; RV32ZBS-NEXT: vmseq.vi v16, v8, 0 ; RV32ZBS-NEXT: srli a0, a1, 5 +; RV32ZBS-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZBS-NEXT: vmv.v.i v8, 0 ; RV32ZBS-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZBS-NEXT: vslidedown.vx v8, v16, a0 ; RV32ZBS-NEXT: vmv.x.s a0, v8 @@ -307,6 +324,8 @@ ; RV64ZBS-NEXT: vle8.v v8, (a0) ; RV64ZBS-NEXT: vmseq.vi v16, v8, 0 ; RV64ZBS-NEXT: srli a0, a1, 6 +; RV64ZBS-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZBS-NEXT: vmv.v.i v8, 0 ; RV64ZBS-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64ZBS-NEXT: vslidedown.vx v8, v16, a0 ; RV64ZBS-NEXT: vmv.x.s a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -21,10 +21,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret %a = load <4 x i8>, <4 x i8>* %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v4i8(<4 x i8> %a, i64 2) @@ -51,10 +53,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 6 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x %c = call <2 x i8> @llvm.vector.extract.v2i8.v8i8(<8 x i8> %a, i64 6) @@ -89,20 +93,24 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX2-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX2-NEXT: vse32.v v8, (a1) +; LMULMAX2-NEXT: vse32.v v10, (a1) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: extract_v2i32_v8i32_2: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: vse32.v v9, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2) @@ -115,10 +123,12 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 6 +; LMULMAX2-NEXT: vslidedown.vi v10, v8, 6 ; LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX2-NEXT: vse32.v v8, (a1) +; LMULMAX2-NEXT: vse32.v v10, (a1) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: extract_v2i32_v8i32_6: @@ -126,10 +136,12 @@ ; LMULMAX1-NEXT: addi a0, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: vse32.v v9, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6) @@ -151,10 +163,12 @@ define void @extract_v2i32_nxv16i32_8( %x, <2 x i32>* %y) { ; CHECK-LABEL: extract_v2i32_nxv16i32_8: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 2, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 6 +; CHECK-NEXT: vslidedown.vi v16, v8, 6 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v16, (a0) ; CHECK-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 6) store <2 x i32> %c, <2 x i32>* %y @@ -175,10 +189,12 @@ define void @extract_v2i8_nxv2i8_2( %x, <2 x i8>* %y) { ; CHECK-LABEL: extract_v2i8_nxv2i8_2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: ret %c = call <2 x i8> 
@llvm.vector.extract.v2i8.nxv2i8( %x, i64 2) store <2 x i8> %c, <2 x i8>* %y @@ -188,20 +204,27 @@ define void @extract_v8i32_nxv16i32_8( %x, <8 x i32>* %y) { ; LMULMAX2-LABEL: extract_v8i32_nxv16i32_8: ; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; LMULMAX2-NEXT: vmv.v.i v16, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m8, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 8 +; LMULMAX2-NEXT: vslidedown.vi v16, v8, 8 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vse32.v v8, (a0) +; LMULMAX2-NEXT: vse32.v v16, (a0) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: extract_v8i32_nxv16i32_8: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; LMULMAX1-NEXT: vmv.v.i v16, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m8, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v16, v8, 8 -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 12 +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; LMULMAX1-NEXT: vmv.v.i v24, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m8, ta, mu +; LMULMAX1-NEXT: vslidedown.vi v24, v8, 12 ; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: vse32.v v24, (a1) ; LMULMAX1-NEXT: vse32.v v16, (a0) ; LMULMAX1-NEXT: ret %c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32( %x, i64 8) @@ -238,20 +261,24 @@ ; LMULMAX2-NEXT: li a2, 32 ; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; LMULMAX2-NEXT: vlm.v v8, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v9, 0 ; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 1 +; LMULMAX2-NEXT: vslidedown.vi v9, v8, 1 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX2-NEXT: vsm.v v8, (a1) +; LMULMAX2-NEXT: vsm.v v9, (a1) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: extract_v8i1_v64i1_8: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vlm.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 1 +; LMULMAX1-NEXT: vslidedown.vi v9, v8, 1 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vsm.v v8, (a1) +; LMULMAX1-NEXT: vsm.v v9, (a1) ; LMULMAX1-NEXT: ret %a = load <64 x i1>, <64 x i1>* %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 8) @@ -266,10 +293,12 @@ ; LMULMAX2-NEXT: li a2, 32 ; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; LMULMAX2-NEXT: vlm.v v8, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v9, 0 ; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX2-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX2-NEXT: vsm.v v8, (a1) +; LMULMAX2-NEXT: vsm.v v9, (a1) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: extract_v8i1_v64i1_48: @@ -311,6 +340,8 @@ define void @extract_v8i1_nxv64i1_8( %x, <8 x i1>* %y) { ; CHECK-LABEL: extract_v8i1_nxv64i1_8: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v8, v0, 1 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -324,6 +355,8 @@ define void @extract_v8i1_nxv64i1_48( %x, <8 x i1>* %y) { ; CHECK-LABEL: extract_v8i1_nxv64i1_48: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 
1, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v8, v0, 6 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -382,10 +415,12 @@ ; LMULMAX2-NEXT: vlm.v v0, (a0) ; LMULMAX2-NEXT: vmv.v.i v8, 0 ; LMULMAX2-NEXT: vmerge.vim v8, v8, 1, v0 +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 2, e8, m2, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX2-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX2-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX2-NEXT: vmsne.vi v0, v8, 0 +; LMULMAX2-NEXT: vmsne.vi v0, v10, 0 ; LMULMAX2-NEXT: vmv.v.i v8, 0 ; LMULMAX2-NEXT: vmerge.vim v8, v8, 1, v0 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -403,10 +438,12 @@ ; LMULMAX1-NEXT: vlm.v v0, (a0) ; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v8, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v9, 0 ; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -432,10 +469,12 @@ ; LMULMAX2-NEXT: vlm.v v0, (a0) ; LMULMAX2-NEXT: vmv.v.i v8, 0 ; LMULMAX2-NEXT: vmerge.vim v8, v8, 1, v0 +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 2, e8, m2, ta, mu -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 10 +; LMULMAX2-NEXT: vslidedown.vi v10, v8, 10 ; LMULMAX2-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX2-NEXT: vmsne.vi v0, v8, 0 +; LMULMAX2-NEXT: vmsne.vi v0, v10, 0 ; LMULMAX2-NEXT: vmv.v.i v8, 0 ; LMULMAX2-NEXT: vmerge.vim v8, v8, 1, v0 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -454,10 +493,12 @@ ; LMULMAX1-NEXT: vlm.v v0, (a0) ; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 10 +; LMULMAX1-NEXT: vslidedown.vi v9, v8, 10 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v8, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v9, 0 ; LMULMAX1-NEXT: vmv.v.i v8, 0 ; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -499,10 +540,12 @@ ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -543,10 +586,12 @@ ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, mu ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v16, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; 
CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -569,10 +614,12 @@ ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: li a1, 42 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v16, v8, a1 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v16, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -594,10 +641,12 @@ ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 26 +; CHECK-NEXT: vslidedown.vi v12, v8, 26 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu @@ -616,6 +665,8 @@ define void @extract_v8i1_nxv32i1_16( %x, <8 x i1>* %y) { ; CHECK-LABEL: extract_v8i1_nxv32i1_16: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vi v8, v0, 2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -7,9 +7,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = extractelement <16 x i8> %a, i32 7 @@ -21,9 +23,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = extractelement <8 x i16> %a, i32 7 @@ -35,9 +39,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = extractelement <4 x i32> %a, i32 2 @@ -72,9 +78,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: 
vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %x %b = extractelement <8 x half> %a, i32 7 @@ -86,9 +94,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %a = load <4 x float>, <4 x float>* %x %b = extractelement <4 x float> %a, i32 2 @@ -113,9 +123,11 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = extractelement <32 x i8> %a, i32 7 @@ -127,9 +139,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = extractelement <16 x i16> %a, i32 7 @@ -141,9 +155,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = extractelement <8 x i32> %a, i32 6 @@ -155,11 +171,13 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -167,9 +185,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 3 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = extractelement <4 x i64> %a, i32 3 @@ -181,9 +201,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %a = load <16 x half>, <16 x half>* %x %b = extractelement <16 x half> %a, i32 7 @@ -195,9 
+217,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %a = load <8 x float>, <8 x float>* %x %b = extractelement <8 x float> %a, i32 2 @@ -225,20 +249,27 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32-NEXT: vslidedown.vi v10, v8, 4 ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vslidedown.vi v8, v8, 5 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vi v10, v8, 5 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v3i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 2 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %a = load <3 x i64>, <3 x i64>* %x %b = extractelement <3 x i64> %a, i32 2 @@ -250,9 +281,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = extractelement <16 x i8> %a, i32 %idx @@ -264,9 +297,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = extractelement <8 x i16> %a, i32 %idx @@ -279,9 +314,11 @@ ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = add <4 x i32> %a, %a @@ -295,11 +332,13 @@ ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vslidedown.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vx v9, v8, a1 +; RV32-NEXT: vmv.x.s a0, v9 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v9, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -308,9 +347,11 @@ ; RV64-NEXT: vsetivli zero, 
2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vslidedown.vx v8, v8, a1 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vx v9, v8, a1 +; RV64-NEXT: vmv.x.s a0, v9 ; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = add <2 x i64> %a, %a @@ -324,9 +365,11 @@ ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %x %b = fadd <8 x half> %a, %a @@ -340,9 +383,11 @@ ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %a = load <4 x float>, <4 x float>* %x %b = fadd <4 x float> %a, %a @@ -356,9 +401,11 @@ ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v9, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %a = load <2 x double>, <2 x double>* %x %b = fadd <2 x double> %a, %a @@ -372,9 +419,11 @@ ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = extractelement <32 x i8> %a, i32 %idx @@ -386,9 +435,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = extractelement <16 x i16> %a, i32 %idx @@ -401,9 +452,11 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = add <8 x i32> %a, %a @@ -417,11 +470,13 @@ ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; 
RV32-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV32-NEXT: vslidedown.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vx v10, v8, a1 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -430,9 +485,11 @@ ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV64-NEXT: vslidedown.vx v8, v8, a1 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vx v10, v8, a1 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = add <4 x i64> %a, %a @@ -446,9 +503,11 @@ ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %a = load <16 x half>, <16 x half>* %x %b = fadd <16 x half> %a, %a @@ -462,9 +521,11 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %a = load <8 x float>, <8 x float>* %x %b = fadd <8 x float> %a, %a @@ -478,9 +539,11 @@ ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vx v10, v8, a1 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %a = load <4 x double>, <4 x double>* %x %b = fadd <4 x double> %a, %a @@ -499,12 +562,17 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: add a1, a1, a1 +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32-NEXT: vslidedown.vx v10, v8, a1 ; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: addi a1, a1, 1 -; RV32-NEXT: vslidedown.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32-NEXT: vslidedown.vx v10, v8, a1 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_v3i64_idx: @@ -512,9 +580,11 @@ ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; RV64-NEXT: vslidedown.vx v8, v8, a1 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vx v10, v8, a1 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %a = load <3 x i64>, <3 x i64>* %x %b = add <3 x i64> %a, %a @@ -527,9 +597,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, 
ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = extractelement <16 x i8> %a, i32 7 @@ -542,9 +614,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vse16.v v9, (a1) ; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = extractelement <8 x i16> %a, i32 7 @@ -557,9 +631,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vse32.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vse32.v v9, (a1) ; CHECK-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = extractelement <4 x i32> %a, i32 2 @@ -573,12 +649,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslidedown.vi v9, v8, 1 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vsrl.vx v8, v9, a0 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a2, v9 ; RV32-NEXT: sw a2, 0(a1) ; RV32-NEXT: sw a0, 4(a1) ; RV32-NEXT: ret @@ -587,9 +665,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vslidedown.vi v9, v8, 1 +; RV64-NEXT: vse64.v v9, (a1) ; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = extractelement <2 x i64> %a, i64 1 @@ -602,9 +682,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vse64.v v9, (a1) ; CHECK-NEXT: ret %a = load <2 x double>, <2 x double>* %x %b = extractelement <2 x double> %a, i64 1 @@ -617,9 +699,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 13 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %bo = add <4 x i32> %x, %ext = extractelement <4 x i32> %bo, i32 2 @@ -631,9 +715,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vrsub.vi v8, v8, 13 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: 
vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %bo = sub <4 x i32> , %x %ext = extractelement <4 x i32> %bo, i32 2 @@ -646,9 +732,11 @@ ; CHECK-NEXT: li a0, 13 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %bo = mul <4 x i32> %x, %ext = extractelement <4 x i32> %bo, i32 2 @@ -659,8 +747,11 @@ ; RV32-LABEL: extractelt_sdiv_v4i32: ; RV32: # %bb.0: ; RV32-NEXT: li a0, -1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v10, v9, 3 @@ -677,16 +768,21 @@ ; RV32-NEXT: vsra.vv v9, v8, v11 ; RV32-NEXT: vsrl.vi v8, v8, 31 ; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 2 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vmv.x.s a0, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_sdiv_v4i32: ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64-NEXT: vslideup.vi v10, v9, 3 @@ -703,9 +799,11 @@ ; RV64-NEXT: vsra.vv v8, v8, v11 ; RV64-NEXT: vsrl.vi v9, v8, 31 ; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 2 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vmv.x.s a0, v9 ; RV64-NEXT: ret %bo = sdiv <4 x i32> %x, %ext = extractelement <4 x i32> %bo, i32 2 @@ -716,8 +814,11 @@ ; CHECK-LABEL: extractelt_udiv_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v10, v9, 3 @@ -728,9 +829,11 @@ ; CHECK-NEXT: vsrl.vv v8, v8, v10 ; CHECK-NEXT: vmulhu.vv v8, v8, v9 ; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a0, v9 ; CHECK-NEXT: ret %bo = udiv <4 x i32> %x, %ext = extractelement <4 x i32> %bo, i32 2 @@ -744,9 +847,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI40_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfadd.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli 
a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %bo = fadd <4 x float> %x, %ext = extractelement <4 x float> %bo, i32 2 @@ -760,9 +865,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfrsub.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %bo = fsub <4 x float> , %x %ext = extractelement <4 x float> %bo, i32 2 @@ -776,9 +883,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI42_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfmul.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %bo = fmul <4 x float> %x, %ext = extractelement <4 x float> %bo, i32 2 @@ -792,9 +901,11 @@ ; CHECK-NEXT: flw ft0, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vfdiv.vf v8, v8, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %bo = fdiv <4 x float> %x, %ext = extractelement <4 x float> %bo, i32 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll @@ -166,7 +166,9 @@ define <2 x half> @bitcast_i32_v2f16(i32 %a) { ; RV32-FP-LABEL: bitcast_i32_v2f16: ; RV32-FP: # %bb.0: -; RV32-FP-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-FP-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-FP-NEXT: vmv.v.i v8, 0 +; RV32-FP-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-FP-NEXT: vmv.s.x v8, a0 ; RV32-FP-NEXT: ret ; @@ -182,7 +184,9 @@ define <1 x float> @bitcast_i32_v1f32(i32 %a) { ; RV32-FP-LABEL: bitcast_i32_v1f32: ; RV32-FP: # %bb.0: -; RV32-FP-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-FP-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-FP-NEXT: vmv.v.i v8, 0 +; RV32-FP-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-FP-NEXT: vmv.s.x v8, a0 ; RV32-FP-NEXT: ret ; @@ -202,13 +206,17 @@ ; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: vslide1up.vx v9, v8, a1 ; RV32-FP-NEXT: vslide1up.vx v10, v9, a0 +; RV32-FP-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-FP-NEXT: vslideup.vi v8, v10, 0 ; RV32-FP-NEXT: ret ; ; RV64-FP-LABEL: bitcast_i64_v4f16: ; RV64-FP: # %bb.0: -; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-FP-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-FP-NEXT: vmv.v.i v8, 0 +; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-FP-NEXT: vmv.s.x v8, a0 ; RV64-FP-NEXT: ret %b = bitcast i64 %a to <4 x half> @@ -222,13 +230,17 @@ ; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: 
vslide1up.vx v9, v8, a1 ; RV32-FP-NEXT: vslide1up.vx v10, v9, a0 +; RV32-FP-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-FP-NEXT: vslideup.vi v8, v10, 0 ; RV32-FP-NEXT: ret ; ; RV64-FP-LABEL: bitcast_i64_v2f32: ; RV64-FP: # %bb.0: -; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-FP-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-FP-NEXT: vmv.v.i v8, 0 +; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-FP-NEXT: vmv.s.x v8, a0 ; RV64-FP-NEXT: ret %b = bitcast i64 %a to <2 x float> @@ -242,13 +254,17 @@ ; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: vslide1up.vx v9, v8, a1 ; RV32-FP-NEXT: vslide1up.vx v10, v9, a0 +; RV32-FP-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-FP-NEXT: vmv.v.i v8, 0 ; RV32-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-FP-NEXT: vslideup.vi v8, v10, 0 ; RV32-FP-NEXT: ret ; ; RV64-FP-LABEL: bitcast_i64_v1f64: ; RV64-FP: # %bb.0: -; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-FP-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-FP-NEXT: vmv.v.i v8, 0 +; RV64-FP-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-FP-NEXT: vmv.s.x v8, a0 ; RV64-FP-NEXT: ret %b = bitcast i64 %a to <1 x double> @@ -258,7 +274,9 @@ define <1 x i16> @bitcast_f16_v1i16(half %a) { ; CHECK-LABEL: bitcast_f16_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast half %a to <1 x i16> @@ -268,7 +286,9 @@ define <1 x half> @bitcast_f16_v1f16(half %a) { ; CHECK-LABEL: bitcast_f16_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast half %a to <1 x half> @@ -278,7 +298,9 @@ define <2 x i16> @bitcast_f32_v2i16(float %a) { ; CHECK-LABEL: bitcast_f32_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast float %a to <2 x i16> @@ -288,7 +310,9 @@ define <2 x half> @bitcast_f32_v2f16(float %a) { ; CHECK-LABEL: bitcast_f32_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast float %a to <2 x half> @@ -298,7 +322,9 @@ define <1 x i32> @bitcast_f32_v1i32(float %a) { ; CHECK-LABEL: bitcast_f32_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast float %a to <1 x i32> @@ -308,7 +334,9 @@ define <1 x float> @bitcast_f32_v1f32(float %a) { ; CHECK-LABEL: bitcast_f32_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast float %a to <1 x float> @@ -318,7 +346,9 @@ 
define <4 x i16> @bitcast_f64_v4i16(double %a) { ; CHECK-LABEL: bitcast_f64_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <4 x i16> @@ -328,7 +358,9 @@ define <4 x half> @bitcast_f64_v4f16(double %a) { ; CHECK-LABEL: bitcast_f64_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <4 x half> @@ -338,7 +370,9 @@ define <2 x i32> @bitcast_f64_v2i32(double %a) { ; CHECK-LABEL: bitcast_f64_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <2 x i32> @@ -348,7 +382,9 @@ define <2 x float> @bitcast_f64_v2f32(double %a) { ; CHECK-LABEL: bitcast_f64_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <2 x float> @@ -358,7 +394,9 @@ define <1 x i64> @bitcast_f64_v1i64(double %a) { ; CHECK-LABEL: bitcast_f64_v1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <1 x i64> @@ -368,7 +406,9 @@ define <1 x double> @bitcast_f64_v1f64(double %a) { ; CHECK-LABEL: bitcast_f64_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret %b = bitcast double %a to <1 x double> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -36,19 +36,28 @@ ; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: li a0, 2 -; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v0, 0 +; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-NEXT: vmv.s.x v0, a0 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vrgather.vi v12, v8, 0 ; LMULMAX1-NEXT: vrgather.vi v12, v9, 3, v0.t ; LMULMAX1-NEXT: li a0, 8 +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v0, 0 +; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-NEXT: vmv.s.x v0, a0 -; LMULMAX1-NEXT: vrgather.vi v9, v10, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vrgather.vi v8, v10, 0 +; LMULMAX1-NEXT: vrgather.vi v8, v11, 3, v0.t ; LMULMAX1-NEXT: li a0, 3 -; LMULMAX1-NEXT: vmv.s.x v8, a0 -; LMULMAX1-NEXT: vrgather.vi v9, v11, 3, v0.t -; LMULMAX1-NEXT: vmv.v.v v0, v8 -; 
LMULMAX1-NEXT: vmerge.vvm v8, v9, v12, v0 +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v0, 0 +; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; LMULMAX1-NEXT: vmv.s.x v0, a0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vmerge.vvm v8, v8, v12, v0 ; LMULMAX1-NEXT: ret ; ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization: @@ -60,12 +69,18 @@ ; LMULMAX2-NEXT: vse32.v v10, (a0) ; LMULMAX2-NEXT: mv a0, sp ; LMULMAX2-NEXT: vse32.v v8, (a0) -; LMULMAX2-NEXT: vslidedown.vi v10, v10, 7 +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v12, 0 +; LMULMAX2-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; LMULMAX2-NEXT: vslidedown.vi v12, v10, 7 ; LMULMAX2-NEXT: addi a1, sp, 12 -; LMULMAX2-NEXT: vse32.v v10, (a1) -; LMULMAX2-NEXT: vslidedown.vi v8, v8, 7 +; LMULMAX2-NEXT: vse32.v v12, (a1) +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 +; LMULMAX2-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; LMULMAX2-NEXT: vslidedown.vi v10, v8, 7 ; LMULMAX2-NEXT: addi a1, sp, 4 -; LMULMAX2-NEXT: vse32.v v8, (a1) +; LMULMAX2-NEXT: vse32.v v10, (a1) ; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a0) ; LMULMAX2-NEXT: addi sp, sp, 16 @@ -108,15 +123,18 @@ define void @buildvec_dominant0_v4f32(<4 x float>* %x) { ; CHECK-LABEL: buildvec_dominant0_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vmv.s.x v8, zero ; CHECK-NEXT: lui a1, %hi(.LCPI4_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI4_0) -; CHECK-NEXT: vlse32.v v8, (a1), zero -; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vlse32.v v9, (a1), zero ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v9, v8, 2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret store <4 x float> , <4 x float>* %x ret void @@ -125,8 +143,11 @@ define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) { ; CHECK-LABEL: buildvec_dominant1_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v9, v8, 1 @@ -146,8 +167,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI6_0) ; CHECK-NEXT: flw ft0, %lo(.LCPI6_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, ft0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v9, v8, 1 @@ -163,31 +187,20 @@ } define void @buildvec_merge0_v4f32(<4 x float>* %x, float %f) { -; RV32-LABEL: buildvec_merge0_v4f32: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 6 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV32-NEXT: lui a2, %hi(.LCPI7_0) -; RV32-NEXT: flw ft0, %lo(.LCPI7_0)(a2) -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; 
RV32-NEXT: vfmv.v.f v8, fa0 -; RV32-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_merge0_v4f32: -; RV64: # %bb.0: -; RV64-NEXT: lui a1, %hi(.LCPI7_0) -; RV64-NEXT: flw ft0, %lo(.LCPI7_0)(a1) -; RV64-NEXT: li a1, 6 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vfmv.v.f v8, fa0 -; RV64-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_merge0_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; CHECK-NEXT: lui a2, %hi(.LCPI7_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI7_0)(a2) +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vfmerge.vfm v8, v8, ft0, v0 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret %v0 = insertelement <4 x float> poison, float %f, i32 0 %v1 = insertelement <4 x float> %v0, float 2.0, i32 1 %v2 = insertelement <4 x float> %v1, float 2.0, i32 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -47,6 +47,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v9, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu @@ -77,14 +79,20 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v9 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfwcvt.f.f.v v9, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 -; RUN: llc 
-mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+m -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+m -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 ; Test optimizing interleaves to widening arithmetic. @@ -38,29 +38,43 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; RV32-V128-LABEL: interleave_v2f64: ; RV32-V128: # %bb.0: -; RV32-V128-NEXT: vmv1r.v v12, v9 +; RV32-V128-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmv1r.v v10, v9 +; RV32-V128-NEXT: vmv1r.v v12, v8 ; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV32-V128-NEXT: vid.v v9 -; RV32-V128-NEXT: vsrl.vi v9, v9, 1 +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v14, v8, 1 ; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v8, v12, v14 ; RV32-V128-NEXT: li a0, 10 +; RV32-V128-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-V128-NEXT: vmv.v.i v0, 0 +; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-V128-NEXT: vmv.s.x v0, a0 -; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v9 -; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v9, v0.t -; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v8, v10, v14, v0.t ; RV32-V128-NEXT: ret ; ; RV64-V128-LABEL: interleave_v2f64: ; RV64-V128: # %bb.0: -; RV64-V128-NEXT: vmv1r.v v12, v9 +; RV64-V128-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-V128-NEXT: vmv.v.i v10, 0 +; RV64-V128-NEXT: vmv.v.i v12, 0 +; RV64-V128-NEXT: vmv1r.v v10, v9 +; RV64-V128-NEXT: vmv1r.v v12, v8 ; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-V128-NEXT: vid.v v10 -; RV64-V128-NEXT: vsrl.vi v14, v10, 1 +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v14, v8, 1 +; RV64-V128-NEXT: vrgather.vv v8, v12, v14 ; RV64-V128-NEXT: li a0, 10 +; RV64-V128-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-V128-NEXT: vmv.v.i v0, 0 +; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-V128-NEXT: vmv.s.x v0, a0 -; RV64-V128-NEXT: vrgather.vv v10, v8, v14 -; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t -; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vrgather.vv v8, v10, v14, v0.t ; RV64-V128-NEXT: ret ; ; RV32-V512-LABEL: interleave_v2f64: @@ -69,9 +83,13 @@ ; RV32-V512-NEXT: vid.v v10 ; RV32-V512-NEXT: vsrl.vi v11, v10, 1 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 ; RV32-V512-NEXT: li a0, 10 +; RV32-V512-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-V512-NEXT: vmv.v.i v0, 0 +; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-V512-NEXT: vmv.s.x v0, a0 -; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; 
RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t ; RV32-V512-NEXT: vmv.v.v v8, v10 ; RV32-V512-NEXT: ret @@ -81,9 +99,13 @@ ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vid.v v10 ; RV64-V512-NEXT: vsrl.vi v11, v10, 1 +; RV64-V512-NEXT: vrgather.vv v10, v8, v11 ; RV64-V512-NEXT: li a0, 10 +; RV64-V512-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-V512-NEXT: vmv.v.i v0, 0 +; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-V512-NEXT: vmv.s.x v0, a0 -; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t ; RV64-V512-NEXT: vmv.v.v v8, v10 ; RV64-V512-NEXT: ret @@ -259,47 +281,59 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: li a1, 24 +; RV32-V128-NEXT: mul a0, a0, a1 ; RV32-V128-NEXT: sub sp, sp, a0 ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV32-V128-NEXT: li a1, 32 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 ; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vrgather.vv v16, v8, v0 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) ; RV32-V128-NEXT: vle32.v v24, (a0) ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, 699051 ; RV32-V128-NEXT: addi a0, a0, -1366 -; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-V128-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-V128-NEXT: vmv.v.i v0, 0 +; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-V128-NEXT: vmv.s.x v0, a0 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t +; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV32-V128-NEXT: vmv4r.v v24, v8 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v0, v8, v24 ; RV32-V128-NEXT: li a0, -1 -; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24 ; RV32-V128-NEXT: vmv8r.v v8, v0 -; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: li a1, 24 +; RV32-V128-NEXT: mul a0, a0, a1 ; 
RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -309,47 +343,59 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: li a1, 24 +; RV64-V128-NEXT: mul a0, a0, a1 ; RV64-V128-NEXT: sub sp, sp, a0 ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV64-V128-NEXT: li a1, 32 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 ; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vrgather.vv v16, v8, v0 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1) ; RV64-V128-NEXT: vle32.v v24, (a0) ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, 699051 ; RV64-V128-NEXT: addiw a0, a0, -1366 -; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-V128-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-V128-NEXT: vmv.v.i v0, 0 +; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-V128-NEXT: vmv.s.x v0, a0 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t +; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV64-V128-NEXT: vmv4r.v v24, v8 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v0, v8, v24 ; RV64-V128-NEXT: li a0, -1 -; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24 ; RV64-V128-NEXT: vmv8r.v v8, v0 -; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: li a1, 24 +; RV64-V128-NEXT: mul a0, a0, a1 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -6,7 +6,9 @@ ; CHECK-LABEL: shuffle_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 11 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; 
CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 @@ -19,7 +21,9 @@ ; CHECK-LABEL: shuffle_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 236 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 @@ -29,53 +33,35 @@ } define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) { -; RV32-LABEL: shuffle_fv_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 9 -; RV32-NEXT: lui a1, %hi(.LCPI2_0) -; RV32-NEXT: fld ft0, %lo(.LCPI2_0)(a1) -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_fv_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI2_0) -; RV64-NEXT: fld ft0, %lo(.LCPI2_0)(a0) -; RV64-NEXT: li a0, 9 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_fv_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: lui a1, %hi(.LCPI2_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI2_0)(a1) +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vfmerge.vfm v8, v8, ft0, v0 +; CHECK-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { -; RV32-LABEL: shuffle_vf_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 6 -; RV32-NEXT: lui a1, %hi(.LCPI3_0) -; RV32-NEXT: fld ft0, %lo(.LCPI3_0)(a1) -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_vf_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI3_0) -; RV64-NEXT: fld ft0, %lo(.LCPI3_0)(a0) -; RV64-NEXT: li a0, 6 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vfmerge.vfm v8, v8, ft0, v0 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_vf_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 6 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: lui a1, %hi(.LCPI3_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI3_0)(a1) +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vfmerge.vfm v8, v8, ft0, v0 +; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s } @@ -135,9 +121,13 @@ ; RV32-NEXT: addi a0, a0, %lo(.LCPI6_0) ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle16.v v14, (a0) +; RV32-NEXT: vrgatherei16.vv v12, v8, v14 ; RV32-NEXT: li a0, 8 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgatherei16.vv v12, v8, v14 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vrgather.vi v12, v10, 1, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret @@ -148,9 +138,13 
@@ ; RV64-NEXT: addi a0, a0, %lo(.LCPI6_0) ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v14, (a0) +; RV64-NEXT: vrgather.vv v12, v8, v14 ; RV64-NEXT: li a0, 8 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vrgather.vv v12, v8, v14 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vrgather.vi v12, v10, 1, v0.t ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret @@ -162,7 +156,9 @@ ; RV32-LABEL: vrgather_shuffle_xv_v4f64: ; RV32: # %bb.0: ; RV32-NEXT: li a0, 12 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV32-NEXT: lui a0, %hi(.LCPI7_0) @@ -178,7 +174,9 @@ ; RV64-LABEL: vrgather_shuffle_xv_v4f64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 12 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: lui a0, %hi(.LCPI7_0) @@ -197,14 +195,17 @@ ; RV32-LABEL: vrgather_shuffle_vx_v4f64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV32-NEXT: vid.v v12 +; RV32-NEXT: vid.v v10 ; RV32-NEXT: li a0, 3 -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: vlse64.v v10, (a1), zero -; RV32-NEXT: vmul.vx v12, v12, a0 +; RV32-NEXT: vmul.vx v12, v10, a0 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: lui a0, %hi(.LCPI8_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI8_0) +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vlse64.v v10, (a0), zero ; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret @@ -212,13 +213,17 @@ ; RV64-LABEL: vrgather_shuffle_vx_v4f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vid.v v12 +; RV64-NEXT: vid.v v10 +; RV64-NEXT: li a0, 3 +; RV64-NEXT: vmul.vx v12, v10, a0 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, %hi(.LCPI8_0) ; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0) +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vlse64.v v10, (a0), zero -; RV64-NEXT: li a0, 3 -; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vmul.vx v12, v12, a0 ; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret @@ -229,8 +234,11 @@ define <4 x half> @slidedown_v4f16(<4 x half> %x) { ; CHECK-LABEL: slidedown_v4f16: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> ret <4 x half> %s @@ -239,8 +247,11 @@ define <8 x float> @slidedown_v8f32(<8 x float> %x) { ; CHECK-LABEL: slidedown_v8f32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu 
-; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %s = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> ret <8 x float> %s @@ -249,6 +260,8 @@ define <4 x half> @slideup_v4f16(<4 x half> %x) { ; CHECK-LABEL: slideup_v4f16: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: vmv1r.v v8, v9 @@ -260,6 +273,8 @@ define <8 x float> @slideup_v8f32(<8 x float> %x) { ; CHECK-LABEL: slideup_v8f32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v10, v8, 3 ; CHECK-NEXT: vmv2r.v v8, v10 @@ -271,6 +286,8 @@ define <8 x float> @splice_unary(<8 x float> %x) { ; CHECK-LABEL: splice_unary: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu @@ -284,6 +301,8 @@ define <8 x double> @splice_unary2(<8 x double> %x) { ; CHECK-LABEL: splice_unary2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 2, e64, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 6 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, mu @@ -297,10 +316,13 @@ define <8 x float> @splice_binary(<8 x float> %x, <8 x float> %y) { ; CHECK-LABEL: splice_binary: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: vslideup.vi v12, v10, 6 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> ret <8 x float> %s @@ -309,11 +331,13 @@ define <8 x double> @splice_binary2(<8 x double> %x, <8 x double> %y) { ; CHECK-LABEL: splice_binary2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 3, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v12, v12, 5 +; CHECK-NEXT: vslidedown.vi v16, v12, 5 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vi v12, v8, 3 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: vslideup.vi v16, v8, 3 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %s = shufflevector <8 x double> %x, <8 x double> %y, <8 x i32> ret <8 x double> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -240,22 +240,26 @@ ; LMULMAX1-LABEL: fp2si_v8f32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: vle32.v v8, (a2) +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 ; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; 
LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v10 -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8 -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v9, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse64.v v12, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vse64.v v9, (a1) ; LMULMAX1-NEXT: addi a0, a1, 48 ; LMULMAX1-NEXT: vse64.v v11, (a0) ; LMULMAX1-NEXT: addi a0, a1, 32 @@ -279,22 +283,26 @@ ; LMULMAX1-LABEL: fp2ui_v8f32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: vle32.v v8, (a2) +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 ; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v10 -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8 -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v9, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse64.v v12, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vse64.v v9, (a1) ; LMULMAX1-NEXT: addi a0, a1, 48 ; LMULMAX1-NEXT: vse64.v v11, (a0) ; LMULMAX1-NEXT: addi a0, a1, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll @@ -10,9 +10,12 @@ ; CHECK-LABEL: vfptosi_v4i1_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x i1> %v @@ -35,9 +38,12 @@ ; CHECK-LABEL: vfptosi_v4i1_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call <4 x i1> 
@llvm.vp.fptosi.v4i1.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl) ret <4 x i1> %v @@ -61,7 +67,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll @@ -318,9 +318,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll @@ -10,9 +10,12 @@ ; CHECK-LABEL: vfptoui_v4i1_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.xu.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x i1> %v @@ -35,9 +38,12 @@ ; CHECK-LABEL: vfptoui_v4i1_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.xu.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl) ret <4 x i1> %v @@ -61,7 +67,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vfcvt.rtz.xu.f.v v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll @@ -318,9 +318,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: 
vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -156,10 +156,12 @@ ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vmerge.vim v10, v10, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v10, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v9, v9, -1, v0 ; LMULMAX1-NEXT: vfcvt.f.x.v v9, v9 @@ -186,10 +188,12 @@ ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vmerge.vim v10, v10, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v10, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v9, v9, 1, v0 ; LMULMAX1-NEXT: vfcvt.f.xu.v v9, v9 @@ -242,13 +246,19 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v10, v9 ; LMULMAX1-NEXT: vfcvt.f.x.v v9, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -286,13 +296,19 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vle16.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v9, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vzext.vf4 v10, v9 ; LMULMAX1-NEXT: vfcvt.f.xu.v v9, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v11, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu @@ -335,10 +351,12 @@ ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmv.v.i v12, 0 ; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; 
LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v9, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v9, v11, -1, v0 ; LMULMAX1-NEXT: vfcvt.f.x.v v9, v9 @@ -346,19 +364,23 @@ ; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vmv1r.v v0, v10 ; LMULMAX1-NEXT: vmerge.vim v10, v13, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v13, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v10, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v10, v11, -1, v0 ; LMULMAX1-NEXT: vfcvt.f.x.v v10, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmerge.vim v12, v12, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v12, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v12, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v11, v11, -1, v0 ; LMULMAX1-NEXT: vfcvt.f.x.v v11, v11 @@ -386,10 +408,12 @@ ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmv.v.i v12, 0 ; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v9, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v9, v11, 1, v0 ; LMULMAX1-NEXT: vfcvt.f.xu.v v9, v9 @@ -397,19 +421,23 @@ ; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vmv1r.v v0, v10 ; LMULMAX1-NEXT: vmerge.vim v10, v13, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v13, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v10, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v10, v11, 1, v0 ; LMULMAX1-NEXT: vfcvt.f.xu.v v10, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmerge.vim v12, v12, 1, v0 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v13, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v12, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v12, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; LMULMAX1-NEXT: vmerge.vim v11, v11, 1, v0 ; LMULMAX1-NEXT: vfcvt.f.xu.v v11, v11 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll @@ -19,8 +19,11 @@ define <1 x i1> @insertelt_idx_v1i1(<1 x i1> %x, i1 %elt, i32 zeroext %idx) nounwind { ; RV32-LABEL: insertelt_idx_v1i1: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32-NEXT: addi a0, a1, 1 @@ -33,8 +36,11 @@ ; ; RV64-LABEL: insertelt_idx_v1i1: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: sext.w a0, a1 @@ -52,8 +58,11 @@ define <2 x i1> @insertelt_v2i1(<2 x i1> %x, i1 %elt) nounwind { ; CHECK-LABEL: insertelt_v2i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu @@ -69,8 +78,11 @@ define <2 x i1> @insertelt_idx_v2i1(<2 x i1> %x, i1 %elt, i32 zeroext %idx) nounwind { ; RV32-LABEL: insertelt_idx_v2i1: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 2, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32-NEXT: addi a0, a1, 1 @@ -83,8 +95,11 @@ ; ; RV64-LABEL: insertelt_idx_v2i1: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 2, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: sext.w a0, a1 @@ -102,8 +117,11 @@ define <8 x i1> @insertelt_v8i1(<8 x i1> %x, i1 %elt) nounwind { ; CHECK-LABEL: insertelt_v8i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, mu @@ -119,8 +137,11 @@ define <8 x i1> @insertelt_idx_v8i1(<8 x i1> %x, i1 %elt, i32 zeroext %idx) nounwind { ; RV32-LABEL: insertelt_idx_v8i1: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32-NEXT: addi a0, a1, 1 @@ -133,8 +154,11 @@ ; ; RV64-LABEL: insertelt_idx_v8i1: ; RV64: # %bb.0: -; RV64-NEXT: 
vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: sext.w a0, a1 @@ -153,8 +177,11 @@ ; CHECK-LABEL: insertelt_v64i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, mu @@ -171,8 +198,11 @@ ; RV32-LABEL: insertelt_idx_v64i1: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 64 -; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetvli zero, a2, e8, m4, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetvli zero, zero, e8, m4, ta, mu ; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: vmerge.vim v12, v12, 1, v0 ; RV32-NEXT: addi a0, a1, 1 @@ -186,8 +216,11 @@ ; RV64-LABEL: insertelt_idx_v64i1: ; RV64: # %bb.0: ; RV64-NEXT: li a2, 64 -; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetvli zero, a2, e8, m4, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli zero, zero, e8, m4, ta, mu ; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vmerge.vim v12, v12, 1, v0 ; RV64-NEXT: sext.w a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -14,8 +14,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v12 ; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 0 +; CHECK-NEXT: vslideup.vi v8, v16, 0 ; CHECK-NEXT: ret %sv = load <2 x i32>, <2 x i32>* %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 0) @@ -27,8 +30,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v12 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 2 +; CHECK-NEXT: vslideup.vi v8, v16, 2 ; CHECK-NEXT: ret %sv = load <2 x i32>, <2 x i32>* %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 2) @@ -40,8 +46,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v12 ; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 6 +; CHECK-NEXT: vslideup.vi v8, v16, 6 ; CHECK-NEXT: ret %sv = load <2 x i32>, <2 x i32>* %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( %vec, <2 x i32> %sv, i64 6) @@ -53,18 +62,28 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vle32.v v12, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, 
e16, m4, ta, mu +; LMULMAX2-NEXT: vmv.v.i v16, 0 +; LMULMAX2-NEXT: vmv2r.v v16, v12 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m4, tu, mu -; LMULMAX2-NEXT: vslideup.vi v8, v12, 0 +; LMULMAX2-NEXT: vslideup.vi v8, v16, 0 ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: vle32.v v12, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v16, (a0) +; LMULMAX1-NEXT: vle32.v v12, (a1) +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; LMULMAX1-NEXT: vmv.v.i v16, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vle32.v v13, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; LMULMAX1-NEXT: vmv.v.i v20, 0 +; LMULMAX1-NEXT: vmv1r.v v16, v12 +; LMULMAX1-NEXT: vmv1r.v v20, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v8, v12, 0 +; LMULMAX1-NEXT: vslideup.vi v8, v20, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, mu ; LMULMAX1-NEXT: vslideup.vi v8, v16, 4 ; LMULMAX1-NEXT: ret @@ -78,18 +97,28 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vle32.v v12, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; LMULMAX2-NEXT: vmv.v.i v16, 0 +; LMULMAX2-NEXT: vmv2r.v v16, v12 ; LMULMAX2-NEXT: vsetivli zero, 16, e32, m4, tu, mu -; LMULMAX2-NEXT: vslideup.vi v8, v12, 8 +; LMULMAX2-NEXT: vslideup.vi v8, v16, 8 ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8: ; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-NEXT: vle32.v v12, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v16, (a0) +; LMULMAX1-NEXT: vle32.v v12, (a1) +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; LMULMAX1-NEXT: vmv.v.i v16, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-NEXT: vle32.v v13, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; LMULMAX1-NEXT: vmv.v.i v20, 0 +; LMULMAX1-NEXT: vmv1r.v v16, v12 +; LMULMAX1-NEXT: vmv1r.v v20, v13 ; LMULMAX1-NEXT: vsetivli zero, 12, e32, m4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v8, v12, 8 +; LMULMAX1-NEXT: vslideup.vi v8, v20, 8 ; LMULMAX1-NEXT: vsetivli zero, 16, e32, m4, tu, mu ; LMULMAX1-NEXT: vslideup.vi v8, v16, 12 ; LMULMAX1-NEXT: ret @@ -102,7 +131,10 @@ ; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv1r.v v8, v12 ; CHECK-NEXT: ret %sv = load <2 x i32>, <2 x i32>* %svp %v = call @llvm.vector.insert.v2i32.nxv8i32( undef, <2 x i32> %sv, i64 0) @@ -165,12 +197,15 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a1) +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vle32.v v10, (a0) +; LMULMAX2-NEXT: vle32.v v12, (a0) +; LMULMAX2-NEXT: vmv1r.v v10, v8 ; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; LMULMAX2-NEXT: vslideup.vi v10, v8, 0 +; LMULMAX2-NEXT: vslideup.vi v12, v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: vse32.v v12, (a0) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_v8i32_v2i32_0: @@ -196,12 +231,15 @@ ; LMULMAX2: # %bb.0: ; 
LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a1) +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vle32.v v10, (a0) +; LMULMAX2-NEXT: vle32.v v12, (a0) +; LMULMAX2-NEXT: vmv1r.v v10, v8 ; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, mu -; LMULMAX2-NEXT: vslideup.vi v10, v8, 2 +; LMULMAX2-NEXT: vslideup.vi v12, v10, 2 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: vse32.v v12, (a0) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_v8i32_v2i32_2: @@ -226,11 +264,14 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a1) +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-NEXT: vle32.v v10, (a0) +; LMULMAX2-NEXT: vle32.v v12, (a0) +; LMULMAX2-NEXT: vmv1r.v v10, v8 ; LMULMAX2-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; LMULMAX2-NEXT: vslideup.vi v10, v8, 6 -; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: vslideup.vi v12, v10, 6 +; LMULMAX2-NEXT: vse32.v v12, (a0) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_v8i32_v2i32_6: @@ -256,15 +297,21 @@ ; LMULMAX2: # %bb.0: ; LMULMAX2-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a1) +; LMULMAX2-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 +; LMULMAX2-NEXT: vmv1r.v v10, v8 +; LMULMAX2-NEXT: vmv.v.i v8, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, tu, mu -; LMULMAX2-NEXT: vslideup.vi v10, v8, 6 -; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: vslideup.vi v8, v10, 6 +; LMULMAX2-NEXT: vse32.v v8, (a0) ; LMULMAX2-NEXT: ret ; ; LMULMAX1-LABEL: insert_v8i32_undef_v2i32_6: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vle32.v v8, (a1) +; LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; LMULMAX1-NEXT: vslideup.vi v9, v8, 2 ; LMULMAX1-NEXT: addi a0, a0, 16 @@ -512,10 +559,17 @@ ; CHECK-LABEL: insert_v2i64_nxv16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v17, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v24, v17 ; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 4 +; CHECK-NEXT: vslideup.vi v8, v24, 4 ; CHECK-NEXT: vs8r.v v8, (a2) ; CHECK-NEXT: ret %sv0 = load <2 x i64>, <2 x i64>* %psv0 @@ -531,7 +585,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vs8r.v v16, (a1) ; CHECK-NEXT: ret %sv = load <2 x i64>, <2 x i64>* %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 0) @@ -544,9 +601,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v8 +; 
CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v16, v8, 2 -; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vslideup.vi v8, v16, 2 +; CHECK-NEXT: vs8r.v v8, (a1) ; CHECK-NEXT: ret %sv = load <2 x i64>, <2 x i64>* %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -23,8 +23,10 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; RV64-NEXT: vmv.s.x v10, a1 -; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, mu ; RV64-NEXT: vslideup.vi v8, v10, 3 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret @@ -43,24 +45,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: lw a3, 16(a0) ; RV32-NEXT: addi a4, a0, 20 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vlse32.v v10, (a4), zero +; RV32-NEXT: vlse32.v v9, (a4), zero ; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu -; RV32-NEXT: vmv.s.x v10, a3 -; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, mu -; RV32-NEXT: vslideup.vi v8, v10, 2 +; RV32-NEXT: vmv.s.x v9, a3 +; RV32-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v12, v9 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vslideup.vi v10, v12, 2 ; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vslide1up.vx v12, v10, a2 -; RV32-NEXT: vslide1up.vx v10, v12, a1 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vslide1up.vx v12, v8, a2 +; RV32-NEXT: vslide1up.vx v8, v12, a1 ; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, mu -; RV32-NEXT: vslideup.vi v8, v10, 2 +; RV32-NEXT: vslideup.vi v10, v8, 2 ; RV32-NEXT: sw a1, 16(a0) ; RV32-NEXT: sw a2, 20(a0) ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: vse64.v v10, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v3i64: @@ -78,6 +86,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 15, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 14 @@ -96,6 +107,9 @@ ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e16, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a3, e16, m4, tu, mu ; RV32-NEXT: vmv.s.x v12, a1 ; RV32-NEXT: addi a1, a2, 1 ; RV32-NEXT: vsetvli zero, a1, e16, m4, tu, mu @@ -109,6 +123,9 @@ ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e16, m4, ta, mu ; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: vsetvli a4, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a3, e16, m4, tu, mu ; RV64-NEXT: vmv.s.x v12, a1 ; RV64-NEXT: sext.w a1, a2 ; RV64-NEXT: addi a2, a1, 1 @@ -128,6 +145,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli a2, zero, e16, m2, 
ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV32-NEXT: vfmv.s.f v10, fa0 ; RV32-NEXT: addi a2, a1, 1 ; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu @@ -140,6 +160,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64-NEXT: vfmv.s.f v10, fa0 ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: addi a2, a1, 1 @@ -176,6 +199,9 @@ ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu ; RV32-NEXT: vmv.s.x v12, a2 ; RV32-NEXT: addi a2, a1, 1 ; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu @@ -189,6 +215,9 @@ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu ; RV64-NEXT: vmv.s.x v12, a2 ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: addi a2, a1, 1 @@ -225,6 +254,9 @@ ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: li a2, 6 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu ; RV32-NEXT: vmv.s.x v12, a2 ; RV32-NEXT: addi a2, a1, 1 ; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu @@ -238,6 +270,9 @@ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a2, 6 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu ; RV64-NEXT: vmv.s.x v12, a2 ; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: addi a2, a1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -192,8 +192,11 @@ ; RV32-LABEL: buildvec_vid_step1_add0_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v8, v9, 2 @@ -216,8 +219,11 @@ ; RV32-LABEL: buildvec_vid_step2_add0_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: li a0, 2 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v8, v9, 2 @@ -309,8 +315,11 @@ define void @buildvec_dominant0_v8i16(<8 x i16>* %x) { ; CHECK-LABEL: buildvec_dominant0_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, 8 ; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v9, 
v8, 3 @@ -499,15 +508,20 @@ ; RV32-NEXT: li a1, 3 ; RV32-NEXT: sb a1, 8(a0) ; RV32-NEXT: li a1, 73 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v9, 2 +; RV32-NEXT: vmv.v.i v8, 2 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: li a1, 36 -; RV32-NEXT: vmv.s.x v8, a1 -; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v9, 3, v0 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vmerge.vim v8, v8, 3, v0 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; @@ -548,13 +562,20 @@ ; RV32-NEXT: vse32.v v8, (a2) ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: vse32.v v8, (a4) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v8, zero ; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v9, v8, 1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vse32.v v9, (a5) ; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v9, v8, 3 @@ -572,13 +593,20 @@ ; RV64-NEXT: vse32.v v8, (a2) ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: vse32.v v8, (a4) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, zero ; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64-NEXT: vslideup.vi v9, v8, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vse32.v v9, (a5) ; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64-NEXT: vslideup.vi v9, v8, 3 @@ -610,15 +638,21 @@ ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: vse16.v v8, (a4) ; CHECK-NEXT: li a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v10, v8, 1 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vse16.v v10, (a5) ; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v9, v8, 3 ; CHECK-NEXT: vse16.v v9, (a6) ; CHECK-NEXT: ret @@ -752,8 +786,11 @@ ; CHECK-LABEL: buildvec_not_vid_v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 +; 
CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 7, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 6 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll @@ -55,6 +55,8 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vle8.v v8, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v9, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v9, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -85,12 +87,18 @@ ; LMULMAX2-NEXT: li a2, 32 ; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, mu ; LMULMAX2-NEXT: vle8.v v8, (a0) +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX2-NEXT: vslidedown.vi v10, v8, 8 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vsext.vf4 v12, v10 +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; LMULMAX2-NEXT: vmv.v.i v10, 0 ; LMULMAX2-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; LMULMAX2-NEXT: vslidedown.vi v10, v8, 16 +; LMULMAX2-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v9, 0 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; LMULMAX2-NEXT: vslidedown.vi v9, v10, 8 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu @@ -109,36 +117,48 @@ ; LMULMAX1-LABEL: sext_v32i8_v32i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: vle8.v v8, (a2) +; LMULMAX1-NEXT: vle8.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 ; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v11, v10 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v10, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 8 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v12, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v12, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v13, v12 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v12, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v14, v12 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v12, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 8 +; LMULMAX1-NEXT: vslidedown.vi v12, v8, 8 +; LMULMAX1-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v15, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; LMULMAX1-NEXT: vslidedown.vi v15, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-NEXT: vsext.vf4 v16, v15 ; LMULMAX1-NEXT: vsext.vf4 v15, v10 ; LMULMAX1-NEXT: vsext.vf4 v10, v12 -; LMULMAX1-NEXT: 
vsext.vf4 v12, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v9 +; LMULMAX1-NEXT: vsext.vf4 v12, v9 +; LMULMAX1-NEXT: vsext.vf4 v9, v8 ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse32.v v10, (a0) -; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: vse32.v v9, (a1) ; LMULMAX1-NEXT: addi a0, a1, 96 ; LMULMAX1-NEXT: vse32.v v15, (a0) ; LMULMAX1-NEXT: addi a0, a1, 64 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 ; Test optimizing interleaves to widening arithmetic. 
@@ -51,29 +51,43 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; RV32-V128-LABEL: interleave_v2i64: ; RV32-V128: # %bb.0: -; RV32-V128-NEXT: vmv1r.v v12, v9 +; RV32-V128-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmv1r.v v10, v9 +; RV32-V128-NEXT: vmv1r.v v12, v8 ; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV32-V128-NEXT: vid.v v9 -; RV32-V128-NEXT: vsrl.vi v9, v9, 1 +; RV32-V128-NEXT: vid.v v8 +; RV32-V128-NEXT: vsrl.vi v14, v8, 1 ; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v8, v12, v14 ; RV32-V128-NEXT: li a0, 10 +; RV32-V128-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-V128-NEXT: vmv.v.i v0, 0 +; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-V128-NEXT: vmv.s.x v0, a0 -; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v9 -; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v9, v0.t -; RV32-V128-NEXT: vmv.v.v v8, v10 +; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-V128-NEXT: vrgatherei16.vv v8, v10, v14, v0.t ; RV32-V128-NEXT: ret ; ; RV64-V128-LABEL: interleave_v2i64: ; RV64-V128: # %bb.0: -; RV64-V128-NEXT: vmv1r.v v12, v9 +; RV64-V128-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-V128-NEXT: vmv.v.i v10, 0 +; RV64-V128-NEXT: vmv.v.i v12, 0 +; RV64-V128-NEXT: vmv1r.v v10, v9 +; RV64-V128-NEXT: vmv1r.v v12, v8 ; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-V128-NEXT: vid.v v10 -; RV64-V128-NEXT: vsrl.vi v14, v10, 1 +; RV64-V128-NEXT: vid.v v8 +; RV64-V128-NEXT: vsrl.vi v14, v8, 1 +; RV64-V128-NEXT: vrgather.vv v8, v12, v14 ; RV64-V128-NEXT: li a0, 10 +; RV64-V128-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-V128-NEXT: vmv.v.i v0, 0 +; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-V128-NEXT: vmv.s.x v0, a0 -; RV64-V128-NEXT: vrgather.vv v10, v8, v14 -; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t -; RV64-V128-NEXT: vmv.v.v v8, v10 +; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-V128-NEXT: vrgather.vv v8, v10, v14, v0.t ; RV64-V128-NEXT: ret ; ; RV32-V512-LABEL: interleave_v2i64: @@ -82,9 +96,13 @@ ; RV32-V512-NEXT: vid.v v10 ; RV32-V512-NEXT: vsrl.vi v11, v10, 1 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 ; RV32-V512-NEXT: li a0, 10 +; RV32-V512-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-V512-NEXT: vmv.v.i v0, 0 +; RV32-V512-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-V512-NEXT: vmv.s.x v0, a0 -; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t ; RV32-V512-NEXT: vmv.v.v v8, v10 ; RV32-V512-NEXT: ret @@ -94,9 +112,13 @@ ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vid.v v10 ; RV64-V512-NEXT: vsrl.vi v11, v10, 1 +; RV64-V512-NEXT: vrgather.vv v10, v8, v11 ; RV64-V512-NEXT: li a0, 10 +; RV64-V512-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-V512-NEXT: vmv.v.i v0, 0 +; RV64-V512-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-V512-NEXT: vmv.s.x v0, a0 -; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t ; RV64-V512-NEXT: vmv.v.v v8, v10 ; RV64-V512-NEXT: ret @@ -365,47 +387,59 @@ ; RV32-V128-NEXT: addi sp, sp, -16 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: li a1, 24 +; RV32-V128-NEXT: mul a0, a0, a1 ; 
RV32-V128-NEXT: sub sp, sp, a0 ; RV32-V128-NEXT: lui a0, %hi(.LCPI15_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV32-V128-NEXT: li a1, 32 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV32-V128-NEXT: vle32.v v0, (a0) -; RV32-V128-NEXT: vmv8r.v v24, v8 -; RV32-V128-NEXT: vrgather.vv v8, v24, v0 ; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: vrgather.vv v16, v8, v0 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI15_1) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI15_1) ; RV32-V128-NEXT: vle32.v v24, (a0) ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, 699051 ; RV32-V128-NEXT: addi a0, a0, -1366 -; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-V128-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-V128-NEXT: vmv.v.i v0, 0 +; RV32-V128-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-V128-NEXT: vmv.s.x v0, a0 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV32-V128-NEXT: vmv.v.v v24, v8 -; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-V128-NEXT: addi a0, sp, 16 ; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t +; RV32-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV32-V128-NEXT: vmv4r.v v24, v8 +; RV32-V128-NEXT: csrr a0, vlenb +; RV32-V128-NEXT: slli a0, a0, 3 +; RV32-V128-NEXT: add a0, sp, a0 +; RV32-V128-NEXT: addi a0, a0, 16 +; RV32-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-V128-NEXT: vwaddu.vv v0, v8, v24 ; RV32-V128-NEXT: li a0, -1 -; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24 ; RV32-V128-NEXT: vmv8r.v v8, v0 -; RV32-V128-NEXT: vmv8r.v v16, v24 ; RV32-V128-NEXT: csrr a0, vlenb -; RV32-V128-NEXT: slli a0, a0, 4 +; RV32-V128-NEXT: li a1, 24 +; RV32-V128-NEXT: mul a0, a0, a1 ; RV32-V128-NEXT: add sp, sp, a0 ; RV32-V128-NEXT: addi sp, sp, 16 ; RV32-V128-NEXT: ret @@ -415,47 +449,59 @@ ; RV64-V128-NEXT: addi sp, sp, -16 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: li a1, 24 +; RV64-V128-NEXT: mul a0, a0, a1 ; RV64-V128-NEXT: sub sp, sp, a0 ; RV64-V128-NEXT: lui a0, %hi(.LCPI15_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV64-V128-NEXT: li a1, 32 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV64-V128-NEXT: vle32.v v0, (a0) -; RV64-V128-NEXT: vmv8r.v v24, v8 -; RV64-V128-NEXT: vrgather.vv v8, v24, v0 ; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: vrgather.vv v16, v8, v0 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, 
sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI15_1) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI15_1) ; RV64-V128-NEXT: vle32.v v24, (a0) ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, 699051 ; RV64-V128-NEXT: addiw a0, a0, -1366 -; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-V128-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-V128-NEXT: vmv.v.i v0, 0 +; RV64-V128-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-V128-NEXT: vmv.s.x v0, a0 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vrgather.vv v8, v16, v24, v0.t -; RV64-V128-NEXT: vmv.v.v v24, v8 -; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV64-V128-NEXT: addi a0, sp, 16 ; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-V128-NEXT: vwaddu.vv v0, v8, v16 +; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t +; RV64-V128-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; RV64-V128-NEXT: vmv4r.v v24, v8 +; RV64-V128-NEXT: csrr a0, vlenb +; RV64-V128-NEXT: slli a0, a0, 3 +; RV64-V128-NEXT: add a0, sp, a0 +; RV64-V128-NEXT: addi a0, a0, 16 +; RV64-V128-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-V128-NEXT: vwaddu.vv v0, v8, v24 ; RV64-V128-NEXT: li a0, -1 -; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16 +; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24 ; RV64-V128-NEXT: vmv8r.v v8, v0 -; RV64-V128-NEXT: vmv8r.v v16, v24 ; RV64-V128-NEXT: csrr a0, vlenb -; RV64-V128-NEXT: slli a0, a0, 4 +; RV64-V128-NEXT: li a1, 24 +; RV64-V128-NEXT: mul a0, a0, a1 ; RV64-V128-NEXT: add sp, sp, a0 ; RV64-V128-NEXT: addi sp, sp, 16 ; RV64-V128-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -6,7 +6,9 @@ ; CHECK-LABEL: shuffle_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 11 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 @@ -19,7 +21,9 @@ ; CHECK-LABEL: shuffle_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 203 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 @@ -32,7 +36,9 @@ ; CHECK-LABEL: shuffle_xv_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 9 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 @@ -45,7 +51,9 @@ ; CHECK-LABEL: 
shuffle_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 6 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 @@ -89,9 +97,13 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -103,7 +115,9 @@ ; CHECK-LABEL: vrgather_shuffle_xv_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 12 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vid.v v9 @@ -123,7 +137,11 @@ ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a0, 3 ; CHECK-NEXT: vmul.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v9, 5 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 @@ -184,8 +202,11 @@ ; RV32-LABEL: vrgather_shuffle_vv_v8i64: ; RV32: # %bb.0: ; RV32-NEXT: li a0, 5 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, a0 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vmv.v.i v20, 2 ; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vslideup.vi v20, v16, 7 @@ -193,9 +214,13 @@ ; RV32-NEXT: addi a0, a0, %lo(.LCPI11_0) ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; RV32-NEXT: vle16.v v21, (a0) +; RV32-NEXT: vrgatherei16.vv v16, v8, v21 ; RV32-NEXT: li a0, 164 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgatherei16.vv v16, v8, v21 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v16, v12, v20, v0.t ; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: ret @@ -203,8 +228,11 @@ ; RV64-LABEL: vrgather_shuffle_vv_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 5 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; RV64-NEXT: vmv.v.i v20, 2 ; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu ; RV64-NEXT: vslideup.vi v20, v16, 7 @@ -212,9 +240,13 @@ ; RV64-NEXT: addi a0, a0, %lo(.LCPI11_0) ; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; RV64-NEXT: vle64.v v24, (a0) +; RV64-NEXT: vrgather.vv v16, v8, v24 ; RV64-NEXT: li a0, 164 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vrgather.vv v16, v8, 
v24 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vv v16, v12, v20, v0.t ; RV64-NEXT: vmv.v.v v8, v16 ; RV64-NEXT: ret @@ -230,20 +262,26 @@ ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: vmv.v.i v20, -1 -; RV32-NEXT: lui a0, %hi(.LCPI12_1) -; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) -; RV32-NEXT: vle16.v v17, (a0) +; RV32-NEXT: vrgatherei16.vv v12, v20, v16 ; RV32-NEXT: li a0, 113 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgatherei16.vv v12, v20, v16 -; RV32-NEXT: vrgatherei16.vv v12, v8, v17, v0.t +; RV32-NEXT: lui a0, %hi(.LCPI12_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vle16.v v16, (a0) +; RV32-NEXT: vrgatherei16.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_xv_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 113 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, %hi(.LCPI12_0) ; RV64-NEXT: addi a0, a0, %lo(.LCPI12_0) @@ -264,21 +302,27 @@ ; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vrgatherei16.vv v12, v8, v16 -; RV32-NEXT: lui a0, %hi(.LCPI13_1) -; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) -; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vmv4r.v v12, v8 +; RV32-NEXT: vrgatherei16.vv v8, v12, v16 ; RV32-NEXT: li a0, 140 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: lui a0, %hi(.LCPI13_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vle16.v v12, (a0) ; RV32-NEXT: vmv.v.i v16, 5 -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_vx_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 115 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, %hi(.LCPI13_0) ; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0) @@ -340,8 +384,11 @@ ; CHECK-LABEL: splat_ve4_ins_i1ve3: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v10, v9, 1 @@ -357,7 +404,9 @@ ; CHECK-LABEL: splat_ve2_we0: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 66 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v8, 2 @@ -377,9 +426,13 @@ ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v11, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; 
CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: li a0, 66 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -391,7 +444,9 @@ ; CHECK-LABEL: splat_ve2_we0_ins_i0we4: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 67 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v8, 2 @@ -413,9 +468,13 @@ ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV32-NEXT: vmv.v.x v11, a0 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vrgather.vv v10, v8, v11 ; RV32-NEXT: li a0, 66 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgather.vv v10, v8, v11 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vrgather.vi v10, v9, 0, v0.t ; RV32-NEXT: vmv1r.v v8, v10 ; RV32-NEXT: ret @@ -427,9 +486,13 @@ ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV64-NEXT: vmv.v.x v11, a0 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vv v10, v8, v11 ; RV64-NEXT: li a0, 66 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vrgather.vv v10, v8, v11 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vrgather.vi v10, v9, 0, v0.t ; RV64-NEXT: vmv1r.v v8, v10 ; RV64-NEXT: ret @@ -441,12 +504,18 @@ ; CHECK-LABEL: splat_ve2_we0_ins_i2we4: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v11, v10, 2 ; CHECK-NEXT: li a0, 70 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v8, 2 @@ -461,8 +530,11 @@ ; RV32-LABEL: splat_ve2_we0_ins_i2ve4_i5we6: ; RV32: # %bb.0: ; RV32-NEXT: li a0, 6 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vmv.v.i v11, 0 ; RV32-NEXT: vsetivli zero, 6, e8, mf2, tu, mu ; RV32-NEXT: vslideup.vi v11, v10, 5 @@ -471,9 +543,13 @@ ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: vrgather.vv v10, v8, v12 ; RV32-NEXT: li a0, 98 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgather.vv v10, v8, v12 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vrgather.vv v10, v9, v11, 
v0.t ; RV32-NEXT: vmv1r.v v8, v10 ; RV32-NEXT: ret @@ -481,8 +557,11 @@ ; RV64-LABEL: splat_ve2_we0_ins_i2ve4_i5we6: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 6 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vmv.v.i v11, 0 ; RV64-NEXT: vsetivli zero, 6, e8, mf2, tu, mu ; RV64-NEXT: vslideup.vi v11, v10, 5 @@ -491,9 +570,13 @@ ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV64-NEXT: vmv.v.x v12, a0 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: vrgather.vv v10, v8, v12 ; RV64-NEXT: li a0, 98 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vrgather.vv v10, v8, v12 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vrgather.vv v10, v9, v11, v0.t ; RV64-NEXT: vmv1r.v v8, v10 ; RV64-NEXT: ret @@ -518,8 +601,11 @@ define <4 x i16> @slidedown_v4i16(<4 x i16> %x) { ; CHECK-LABEL: slidedown_v4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> ret <4 x i16> %s @@ -528,8 +614,11 @@ define <8 x i32> @slidedown_v8i32(<8 x i32> %x) { ; CHECK-LABEL: slidedown_v8i32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %s = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> ret <8 x i32> %s @@ -538,6 +627,8 @@ define <4 x i16> @slideup_v4i16(<4 x i16> %x) { ; CHECK-LABEL: slideup_v4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: vmv1r.v v8, v9 @@ -549,6 +640,8 @@ define <8 x i32> @slideup_v8i32(<8 x i32> %x) { ; CHECK-LABEL: slideup_v8i32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v10, v8, 3 ; CHECK-NEXT: vmv2r.v v8, v10 @@ -560,6 +653,8 @@ define <8 x i16> @splice_unary(<8 x i16> %x) { ; CHECK-LABEL: splice_unary: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu @@ -573,6 +668,8 @@ define <8 x i32> @splice_unary2(<8 x i32> %x) { ; CHECK-LABEL: splice_unary2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 3, e32, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 5 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu @@ -586,10 +683,13 @@ define <8 x i16> @splice_binary(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: splice_binary: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: 
vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 6 +; CHECK-NEXT: vslideup.vi v10, v9, 6 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %s = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %s @@ -598,10 +698,13 @@ define <8 x i32> @splice_binary2(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: splice_binary2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 3, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vslidedown.vi v12, v8, 5 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: vslideup.vi v12, v10, 3 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> ret <8 x i32> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll @@ -230,7 +230,9 @@ ; LMULMAX1-RV32-LABEL: splat_v4i64: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: li a3, 5 -; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; LMULMAX1-RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a3 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v8, a2 @@ -710,6 +712,9 @@ ; LMULMAX2-RV32-NEXT: addi a0, a0, 32 ; LMULMAX2-RV32-NEXT: vle64.v v14, (a0) ; LMULMAX2-RV32-NEXT: li a0, 85 +; LMULMAX2-RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a0 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v16, a2 @@ -747,6 +752,9 @@ ; LMULMAX1-RV32-NEXT: addi a0, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v15, (a0) ; LMULMAX1-RV32-NEXT: li a0, 5 +; LMULMAX1-RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v16, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -844,33 +844,43 @@ ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: li a1, 513 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, 4 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32-NEXT: lui a1, 1 ; RV32-NEXT: addi a2, a1, 78 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vmerge.vim v9, v9, 3, v0 ; RV32-NEXT: lui a2, 8 ; RV32-NEXT: addi a2, a2, 304 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 
0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vmerge.vim v9, v9, 2, v0 ; RV32-NEXT: lui a2, 3 ; RV32-NEXT: addi a2, a2, -2044 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: li a2, -128 ; RV32-NEXT: vmerge.vxm v11, v10, a2, v0 ; RV32-NEXT: addi a1, a1, 32 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI52_0) @@ -891,33 +901,43 @@ ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: li a1, 513 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, 4 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: addiw a2, a1, 78 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vmerge.vim v9, v9, 3, v0 ; RV64-NEXT: lui a2, 8 ; RV64-NEXT: addiw a2, a2, 304 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vmerge.vim v9, v9, 2, v0 ; RV64-NEXT: lui a2, 3 ; RV64-NEXT: addiw a2, a2, -2044 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: li a2, -128 ; RV64-NEXT: vmerge.vxm v11, v10, a2, v0 ; RV64-NEXT: addiw a1, a1, 32 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: lui a1, %hi(.LCPI52_0) @@ -944,9 +964,16 @@ ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: li a1, 33 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 3 ; CHECK-NEXT: vmerge.vim v10, v10, 2, v0 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, mu @@ -983,7 +1010,11 @@ ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; 
CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v10, v9, 2 @@ -996,7 +1027,11 @@ ; CHECK-NEXT: vmulhu.vv v8, v8, v10 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v10, v9, 3 @@ -1061,7 +1096,9 @@ ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: lui a1, 5 ; RV32-NEXT: addi a1, a1, -1452 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, 7 @@ -1081,7 +1118,9 @@ ; RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: lui a1, 5 ; RV64-NEXT: addiw a1, a1, -1452 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, 7 @@ -1106,9 +1145,13 @@ ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: li a1, 105 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: lui a1, 5 ; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: lui a1, 1048571 ; RV32-NEXT: addi a1, a1, 1755 @@ -1125,9 +1168,13 @@ ; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: li a1, 105 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: lui a1, 5 ; RV64-NEXT: addiw a1, a1, -1755 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: lui a1, 1048571 ; RV64-NEXT: addiw a1, a1, 1755 @@ -1150,9 +1197,13 @@ ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: li a1, 5 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: lui a1, 419430 ; RV32-NEXT: addi a1, a1, 1639 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: lui a1, 629146 ; RV32-NEXT: addi a1, a1, -1639 @@ -1206,8 +1257,11 @@ ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vmadd.vv v10, v8, v9 ; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v8, a1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV32-NEXT: vslideup.vi v9, v8, 2 @@ -4361,7 +4415,9 @@ ; LMULMAX2-RV32-NEXT: vle8.v v8, (a0) ; LMULMAX2-RV32-NEXT: lui a2, 66049 ; LMULMAX2-RV32-NEXT: addi a2, a2, 32 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli 
a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV32-NEXT: lui a2, %hi(.LCPI153_0) @@ -4374,7 +4430,9 @@ ; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 ; LMULMAX2-RV32-NEXT: lui a2, 163907 ; LMULMAX2-RV32-NEXT: addi a2, a2, -2044 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: li a2, -128 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu @@ -4383,20 +4441,26 @@ ; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 ; LMULMAX2-RV32-NEXT: lui a2, 8208 ; LMULMAX2-RV32-NEXT: addi a2, a2, 513 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v10, 4 ; LMULMAX2-RV32-NEXT: vmerge.vim v10, v10, 1, v0 ; LMULMAX2-RV32-NEXT: lui a2, 66785 ; LMULMAX2-RV32-NEXT: addi a2, a2, 78 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmerge.vim v10, v10, 3, v0 ; LMULMAX2-RV32-NEXT: lui a2, 529160 ; LMULMAX2-RV32-NEXT: addi a2, a2, 304 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmerge.vim v10, v10, 2, v0 @@ -4411,7 +4475,9 @@ ; LMULMAX2-RV64-NEXT: vle8.v v8, (a0) ; LMULMAX2-RV64-NEXT: lui a2, 66049 ; LMULMAX2-RV64-NEXT: addiw a2, a2, 32 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI153_0) @@ -4424,7 +4490,9 @@ ; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 ; LMULMAX2-RV64-NEXT: lui a2, 163907 ; LMULMAX2-RV64-NEXT: addiw a2, a2, -2044 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: li a2, -128 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu @@ -4433,20 +4501,26 @@ ; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 ; LMULMAX2-RV64-NEXT: lui a2, 8208 ; LMULMAX2-RV64-NEXT: addiw a2, a2, 513 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmv.v.i 
v10, 4 ; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 1, v0 ; LMULMAX2-RV64-NEXT: lui a2, 66785 ; LMULMAX2-RV64-NEXT: addiw a2, a2, 78 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 3, v0 ; LMULMAX2-RV64-NEXT: lui a2, 529160 ; LMULMAX2-RV64-NEXT: addiw a2, a2, 304 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 2, v0 @@ -4481,21 +4555,33 @@ ; LMULMAX2-RV32-NEXT: vle16.v v10, (a0) ; LMULMAX2-RV32-NEXT: lui a1, 2 ; LMULMAX2-RV32-NEXT: addi a1, a1, 289 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 3 +; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v8, 3 +; LMULMAX2-RV32-NEXT: vmerge.vim v12, v8, 2, v0 ; LMULMAX2-RV32-NEXT: lui a1, 4 ; LMULMAX2-RV32-NEXT: addi a1, a1, 64 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v8, a1 -; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 2, v0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 ; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 1, v0 ; LMULMAX2-RV32-NEXT: li a1, 257 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 -; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI154_0) -; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI154_0) -; LMULMAX2-RV32-NEXT: vle16.v v16, (a1) ; LMULMAX2-RV32-NEXT: lui a1, 1048568 +; LMULMAX2-RV32-NEXT: lui a2, %hi(.LCPI154_0) +; LMULMAX2-RV32-NEXT: addi a2, a2, %lo(.LCPI154_0) +; LMULMAX2-RV32-NEXT: vle16.v v16, (a2) ; LMULMAX2-RV32-NEXT: vmerge.vxm v18, v14, a1, v0 ; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 ; LMULMAX2-RV32-NEXT: vmerge.vim v8, v14, 1, v0 @@ -4514,21 +4600,33 @@ ; LMULMAX2-RV64-NEXT: vle16.v v10, (a0) ; LMULMAX2-RV64-NEXT: lui a1, 2 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 289 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV64-NEXT: vmv.v.i v12, 3 +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v8, 3 +; LMULMAX2-RV64-NEXT: vmerge.vim v12, v8, 2, v0 ; LMULMAX2-RV64-NEXT: lui a1, 4 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 64 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v8, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v8, a1 -; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 2, v0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 ; 
LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 1, v0 ; LMULMAX2-RV64-NEXT: li a1, 257 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmv.v.i v14, 0 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI154_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI154_0) -; LMULMAX2-RV64-NEXT: vle16.v v16, (a1) ; LMULMAX2-RV64-NEXT: lui a1, 1048568 +; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI154_0) +; LMULMAX2-RV64-NEXT: addi a2, a2, %lo(.LCPI154_0) +; LMULMAX2-RV64-NEXT: vle16.v v16, (a2) ; LMULMAX2-RV64-NEXT: vmerge.vxm v18, v14, a1, v0 ; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 ; LMULMAX2-RV64-NEXT: vmerge.vim v8, v14, 1, v0 @@ -4567,7 +4665,11 @@ ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vle32.v v8, (a0) ; LMULMAX2-NEXT: li a1, 68 +; LMULMAX2-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v0, 0 +; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-NEXT: vmv.s.x v0, a1 +; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: lui a1, %hi(.LCPI155_0) ; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI155_0) ; LMULMAX2-NEXT: vle32.v v10, (a1) @@ -4579,7 +4681,11 @@ ; LMULMAX2-NEXT: vmulhu.vv v8, v8, v12 ; LMULMAX2-NEXT: vadd.vv v8, v8, v10 ; LMULMAX2-NEXT: li a1, 136 +; LMULMAX2-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-NEXT: vmv.v.i v0, 0 +; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-NEXT: vmv.s.x v0, a1 +; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-NEXT: vmv.v.i v10, 2 ; LMULMAX2-NEXT: vmerge.vim v10, v10, 1, v0 ; LMULMAX2-NEXT: vsrl.vv v8, v8, v10 @@ -4593,7 +4699,11 @@ ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 ; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) ; LMULMAX1-RV32-NEXT: lui a2, 524288 +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v10, a2 +; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; LMULMAX1-RV32-NEXT: vslideup.vi v11, v10, 2 @@ -4606,7 +4716,11 @@ ; LMULMAX1-RV32-NEXT: vmulhu.vv v9, v9, v11 ; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: li a2, 1 +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v12, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v12, a2 +; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v13, 2 ; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; LMULMAX1-RV32-NEXT: vslideup.vi v13, v12, 3 @@ -4654,8 +4768,11 @@ ; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v8, v10 ; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 ; LMULMAX2-RV32-NEXT: lui a1, 524288 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v12, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 ; LMULMAX2-RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; LMULMAX2-RV32-NEXT: vslideup.vi v14, v12, 5 @@ -4677,7 +4794,11 @@ ; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) ; LMULMAX2-RV64-NEXT: li a1, -1 ; LMULMAX2-RV64-NEXT: slli a1, a1, 63 +; 
LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v10, a1 +; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmv.v.i v12, 0 ; LMULMAX2-RV64-NEXT: vsetivli zero, 3, e64, m2, tu, mu ; LMULMAX2-RV64-NEXT: vslideup.vi v12, v10, 2 @@ -4775,7 +4896,9 @@ ; LMULMAX2-RV32-NEXT: vmv.v.x v10, a2 ; LMULMAX2-RV32-NEXT: lui a2, 304453 ; LMULMAX2-RV32-NEXT: addi a2, a2, -1452 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV32-NEXT: li a2, 57 ; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu @@ -4796,7 +4919,9 @@ ; LMULMAX2-RV64-NEXT: vmv.v.x v10, a2 ; LMULMAX2-RV64-NEXT: lui a2, 304453 ; LMULMAX2-RV64-NEXT: addiw a2, a2, -1452 -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; LMULMAX2-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX2-RV64-NEXT: li a2, 57 ; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu @@ -4816,7 +4941,9 @@ ; LMULMAX1-RV32-NEXT: vle8.v v9, (a1) ; LMULMAX1-RV32-NEXT: lui a2, 5 ; LMULMAX1-RV32-NEXT: addi a2, a2, -1452 -; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v10, -9 @@ -4835,7 +4962,9 @@ ; LMULMAX1-RV64-NEXT: vle8.v v9, (a1) ; LMULMAX1-RV64-NEXT: lui a2, 5 ; LMULMAX1-RV64-NEXT: addiw a2, a2, -1452 -; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX1-RV64-NEXT: vmv.s.x v0, a2 ; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-RV64-NEXT: vmv.v.i v10, -9 @@ -4858,9 +4987,13 @@ ; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) ; LMULMAX2-RV32-NEXT: lui a1, 7 ; LMULMAX2-RV32-NEXT: addi a1, a1, -1687 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV32-NEXT: lui a1, 5 ; LMULMAX2-RV32-NEXT: addi a1, a1, -1755 +; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV32-NEXT: lui a1, 1048571 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1755 @@ -4878,9 +5011,13 @@ ; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) ; LMULMAX2-RV64-NEXT: lui a1, 7 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -1687 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV64-NEXT: lui a1, 5 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -1755 +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV64-NEXT: lui a1, 1048571 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1755 @@ -4899,7 +5036,11 @@ ; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vle16.v v9, (a1) ; LMULMAX1-NEXT: li a2, 105 +; 
LMULMAX1-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-NEXT: vmv.v.i v0, 0 +; LMULMAX1-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-NEXT: vmv.s.x v0, a2 +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v10, 7 ; LMULMAX1-NEXT: vmerge.vim v10, v10, -7, v0 ; LMULMAX1-NEXT: vdiv.vv v9, v9, v10 @@ -4919,9 +5060,13 @@ ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) ; LMULMAX2-RV32-NEXT: li a1, 85 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV32-NEXT: lui a1, 419430 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1639 +; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV32-NEXT: lui a1, 629146 ; LMULMAX2-RV32-NEXT: addi a1, a1, -1639 @@ -4956,9 +5101,13 @@ ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 ; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) ; LMULMAX1-RV32-NEXT: li a2, 5 +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX1-RV32-NEXT: vmv.s.x v0, a2 ; LMULMAX1-RV32-NEXT: lui a2, 419430 ; LMULMAX1-RV32-NEXT: addi a2, a2, 1639 +; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2 ; LMULMAX1-RV32-NEXT: lui a2, 629146 ; LMULMAX1-RV32-NEXT: addi a2, a2, -1639 @@ -5004,6 +5153,9 @@ ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX2-RV32-NEXT: li a1, 17 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a2, a1, 1365 @@ -5014,6 +5166,9 @@ ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmulh.vv v10, v8, v10 ; LMULMAX2-RV32-NEXT: li a1, 51 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v12, -1 @@ -5023,6 +5178,9 @@ ; LMULMAX2-RV32-NEXT: li a1, 63 ; LMULMAX2-RV32-NEXT: vsrl.vx v8, v12, a1 ; LMULMAX2-RV32-NEXT: li a1, 68 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV32-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0 @@ -5036,25 +5194,29 @@ ; LMULMAX2-RV64-LABEL: mulhs_v4i64: ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) ; LMULMAX2-RV64-NEXT: li a1, 5 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI160_0) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI160_0) -; LMULMAX2-RV64-NEXT: vlse64.v v8, (a1), zero +; LMULMAX2-RV64-NEXT: vlse64.v v10, (a1), zero ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI160_1) ; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI160_1)(a1) -; LMULMAX2-RV64-NEXT: vle64.v v10, (a0) ; LMULMAX2-RV64-NEXT: 
vmv.v.i v12, -1 ; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 0, v0 -; LMULMAX2-RV64-NEXT: vmerge.vxm v8, v8, a1, v0 -; LMULMAX2-RV64-NEXT: vmulh.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vmacc.vv v8, v10, v12 +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV64-NEXT: vmulh.vv v10, v8, v10 +; LMULMAX2-RV64-NEXT: vmacc.vv v10, v8, v12 ; LMULMAX2-RV64-NEXT: li a1, 63 -; LMULMAX2-RV64-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX2-RV64-NEXT: vsrl.vx v8, v10, a1 ; LMULMAX2-RV64-NEXT: vmv.v.i v12, 1 ; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 0, v0 -; LMULMAX2-RV64-NEXT: vsra.vv v8, v8, v12 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: vsra.vv v10, v10, v12 +; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 ; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -115,14 +115,18 @@ ; CHECK-LABEL: buildvec_mask_v1i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 2 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v1i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <3 x i1> @@ -132,14 +136,18 @@ ; CHECK-LABEL: buildvec_mask_optsize_v1i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 2 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_v1i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <3 x i1> @@ -149,14 +157,18 @@ ; CHECK-LABEL: buildvec_mask_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 6 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v4i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 6 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <4 x i1> @@ -166,7 +178,9 @@ ; CHECK-LABEL: buildvec_mask_nonconst_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 3 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.x v8, a1 @@ -178,7 +192,9 @@ ; ZVE32F-LABEL: buildvec_mask_nonconst_v4i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a2, 3 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; 
ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a2 ; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a1 @@ -278,14 +294,18 @@ ; CHECK-LABEL: buildvec_mask_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 182 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v8i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 182 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <8 x i1> @@ -295,7 +315,9 @@ ; CHECK-LABEL: buildvec_mask_nonconst_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 19 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v0, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a1 @@ -307,7 +329,9 @@ ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a2, 19 -; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a2 ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a1 @@ -487,14 +511,18 @@ ; CHECK-LABEL: buildvec_mask_v10i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 949 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v10i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 949 -; ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <10 x i1> @@ -505,7 +533,9 @@ ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: lui a0, 11 ; CHECK-RV32-NEXT: addi a0, a0, 1718 -; CHECK-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-RV32-NEXT: vmv.v.i v0, 0 +; CHECK-RV32-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-RV32-NEXT: vmv.s.x v0, a0 ; CHECK-RV32-NEXT: ret ; @@ -513,7 +543,9 @@ ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: lui a0, 11 ; CHECK-RV64-NEXT: addiw a0, a0, 1718 -; CHECK-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-RV64-NEXT: vmv.v.i v0, 0 +; CHECK-RV64-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-RV64-NEXT: vmv.s.x v0, a0 ; CHECK-RV64-NEXT: ret ret <16 x i1> @@ -523,14 +555,18 @@ ; CHECK-LABEL: buildvec_mask_v16i1_undefs: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 1722 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v16i1_undefs: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: li a0, 1722 -; ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; 
ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVE32F-NEXT: vmv.v.i v0, 0 +; ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu ; ZVE32F-NEXT: vmv.s.x v0, a0 ; ZVE32F-NEXT: ret ret <16 x i1> @@ -540,20 +576,30 @@ ; RV32-LMULMAX1-LABEL: buildvec_mask_v32i1: ; RV32-LMULMAX1: # %bb.0: ; RV32-LMULMAX1-NEXT: li a0, 1776 -; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX1-NEXT: lui a0, 11 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX1-NEXT: ret ; ; RV64-LMULMAX1-LABEL: buildvec_mask_v32i1: ; RV64-LMULMAX1: # %bb.0: ; RV64-LMULMAX1-NEXT: li a0, 1776 -; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX1-NEXT: lui a0, 11 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX1-NEXT: ret ; @@ -561,7 +607,9 @@ ; RV32-LMULMAX2: # %bb.0: ; RV32-LMULMAX2-NEXT: lui a0, 748384 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX2-NEXT: ret ; @@ -569,7 +617,9 @@ ; RV64-LMULMAX2: # %bb.0: ; RV64-LMULMAX2-NEXT: lui a0, 748384 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX2-NEXT: ret ; @@ -577,7 +627,9 @@ ; RV32-LMULMAX4: # %bb.0: ; RV32-LMULMAX4-NEXT: lui a0, 748384 ; RV32-LMULMAX4-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX4-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX4-NEXT: ret ; @@ -585,7 +637,9 @@ ; RV64-LMULMAX4: # %bb.0: ; RV64-LMULMAX4-NEXT: lui a0, 748384 ; RV64-LMULMAX4-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX4-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX4-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX4-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX4-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX4-NEXT: ret ; @@ -593,7 +647,9 @@ ; RV32-LMULMAX8: # %bb.0: ; RV32-LMULMAX8-NEXT: lui a0, 748384 ; RV32-LMULMAX8-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX8-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX8-NEXT: ret ; @@ -601,7 +657,9 @@ ; RV64-LMULMAX8: # %bb.0: ; 
RV64-LMULMAX8-NEXT: lui a0, 748384 ; RV64-LMULMAX8-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX8-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX8-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX8-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX8-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX8-NEXT: ret ret <32 x i1> @@ -611,13 +669,21 @@ ; RV32-LMULMAX1-LABEL: buildvec_mask_v64i1: ; RV32-LMULMAX1: # %bb.0: ; RV32-LMULMAX1-NEXT: li a0, 1776 -; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX1-NEXT: lui a0, 4 ; RV32-LMULMAX1-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX1-NEXT: lui a0, 11 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV32-LMULMAX1-NEXT: ret @@ -625,13 +691,21 @@ ; RV64-LMULMAX1-LABEL: buildvec_mask_v64i1: ; RV64-LMULMAX1: # %bb.0: ; RV64-LMULMAX1-NEXT: li a0, 1776 -; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX1-NEXT: lui a0, 4 ; RV64-LMULMAX1-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV64-LMULMAX1-NEXT: lui a0, 11 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV64-LMULMAX1-NEXT: ret @@ -640,10 +714,15 @@ ; RV32-LMULMAX2: # %bb.0: ; RV32-LMULMAX2-NEXT: lui a0, 748384 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX2-NEXT: lui a0, 748388 ; RV32-LMULMAX2-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX2-NEXT: ret ; @@ -651,10 +730,15 @@ ; RV64-LMULMAX2: # %bb.0: ; RV64-LMULMAX2-NEXT: lui a0, 748384 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX2-NEXT: lui a0, 748388 ; RV64-LMULMAX2-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, 
mu ; RV64-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX2-NEXT: ret ; @@ -662,12 +746,16 @@ ; RV32-LMULMAX4: # %bb.0: ; RV32-LMULMAX4-NEXT: lui a0, 748388 ; RV32-LMULMAX4-NEXT: addi a0, a0, -1793 -; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX4-NEXT: lui a0, 748384 ; RV32-LMULMAX4-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v0, a0 -; RV32-LMULMAX4-NEXT: vsetvli zero, zero, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vslideup.vi v0, v8, 1 ; RV32-LMULMAX4-NEXT: ret ; @@ -683,12 +771,16 @@ ; RV32-LMULMAX8: # %bb.0: ; RV32-LMULMAX8-NEXT: lui a0, 748388 ; RV32-LMULMAX8-NEXT: addi a0, a0, -1793 -; RV32-LMULMAX8-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX8-NEXT: lui a0, 748384 ; RV32-LMULMAX8-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v0, a0 -; RV32-LMULMAX8-NEXT: vsetvli zero, zero, e32, mf2, tu, mu ; RV32-LMULMAX8-NEXT: vslideup.vi v0, v8, 1 ; RV32-LMULMAX8-NEXT: ret ; @@ -706,19 +798,33 @@ ; RV32-LMULMAX1-LABEL: buildvec_mask_v128i1: ; RV32-LMULMAX1: # %bb.0: ; RV32-LMULMAX1-NEXT: li a0, 1776 -; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX1-NEXT: lui a0, 11 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX1-NEXT: lui a0, 8 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v12, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v12, a0 ; RV32-LMULMAX1-NEXT: lui a0, 4 ; RV32-LMULMAX1-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX1-NEXT: lui a0, 14 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1722 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v14, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v14, a0 ; RV32-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV32-LMULMAX1-NEXT: vmv1r.v v11, v0 @@ -728,19 +834,33 @@ ; RV64-LMULMAX1-LABEL: buildvec_mask_v128i1: ; RV64-LMULMAX1: # %bb.0: ; RV64-LMULMAX1-NEXT: li a0, 1776 -; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX1-NEXT: lui a0, 11 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 
1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX1-NEXT: lui a0, 8 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v12, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v12, a0 ; RV64-LMULMAX1-NEXT: lui a0, 4 ; RV64-LMULMAX1-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV64-LMULMAX1-NEXT: lui a0, 14 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1722 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v14, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v14, a0 ; RV64-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV64-LMULMAX1-NEXT: vmv1r.v v11, v0 @@ -751,16 +871,27 @@ ; RV32-LMULMAX2: # %bb.0: ; RV32-LMULMAX2-NEXT: lui a0, 748384 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX2-NEXT: lui a0, 748388 ; RV32-LMULMAX2-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX2-NEXT: lui a0, 551776 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX2-NEXT: lui a0, 945060 ; RV32-LMULMAX2-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v10, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v10, a0 ; RV32-LMULMAX2-NEXT: ret ; @@ -768,16 +899,27 @@ ; RV64-LMULMAX2: # %bb.0: ; RV64-LMULMAX2-NEXT: lui a0, 748384 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX2-NEXT: lui a0, 748388 ; RV64-LMULMAX2-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX2-NEXT: lui a0, 551776 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v9, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v9, a0 ; RV64-LMULMAX2-NEXT: lui a0, 945060 ; RV64-LMULMAX2-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v10, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v10, a0 ; RV64-LMULMAX2-NEXT: ret ; @@ -785,18 +927,28 @@ ; RV32-LMULMAX4: # %bb.0: ; RV32-LMULMAX4-NEXT: lui a0, 748388 
; RV32-LMULMAX4-NEXT: addi a0, a0, -1793 -; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX4-NEXT: lui a0, 748384 ; RV32-LMULMAX4-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v0, a0 -; RV32-LMULMAX4-NEXT: vsetvli zero, zero, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vslideup.vi v0, v8, 1 ; RV32-LMULMAX4-NEXT: lui a0, 945060 ; RV32-LMULMAX4-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX4-NEXT: lui a0, 551776 ; RV32-LMULMAX4-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX4-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX4-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX4-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-LMULMAX4-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX4-NEXT: vslideup.vi v8, v9, 1 ; RV32-LMULMAX4-NEXT: ret @@ -816,22 +968,32 @@ ; RV32-LMULMAX8: # %bb.0: ; RV32-LMULMAX8-NEXT: lui a0, 748388 ; RV32-LMULMAX8-NEXT: addi a0, a0, -1793 -; RV32-LMULMAX8-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX8-NEXT: lui a0, 748384 ; RV32-LMULMAX8-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX8-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV32-LMULMAX8-NEXT: vslideup.vi v0, v8, 1 ; RV32-LMULMAX8-NEXT: lui a0, 551776 ; RV32-LMULMAX8-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX8-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV32-LMULMAX8-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX8-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV32-LMULMAX8-NEXT: vslideup.vi v0, v8, 2 ; RV32-LMULMAX8-NEXT: lui a0, 945060 ; RV32-LMULMAX8-NEXT: addi a0, a0, -1793 -; RV32-LMULMAX8-NEXT: vmv.s.x v8, a0 +; RV32-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX8-NEXT: vmv.v.i v8, 0 ; RV32-LMULMAX8-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV32-LMULMAX8-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX8-NEXT: vslideup.vi v0, v8, 3 ; RV32-LMULMAX8-NEXT: ret ; @@ -839,12 +1001,16 @@ ; RV64-LMULMAX8: # %bb.0: ; RV64-LMULMAX8-NEXT: lui a0, %hi(.LCPI20_0) ; RV64-LMULMAX8-NEXT: ld a0, %lo(.LCPI20_0)(a0) -; RV64-LMULMAX8-NEXT: lui a1, %hi(.LCPI20_1) -; RV64-LMULMAX8-NEXT: ld a1, %lo(.LCPI20_1)(a1) -; RV64-LMULMAX8-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX8-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX8-NEXT: vsetivli zero, 2, e64, m1, tu, mu ; RV64-LMULMAX8-NEXT: vmv.s.x v8, a0 -; RV64-LMULMAX8-NEXT: vmv.s.x v0, a1 -; RV64-LMULMAX8-NEXT: vsetvli zero, zero, e64, m1, tu, mu +; RV64-LMULMAX8-NEXT: lui a0, %hi(.LCPI20_1) +; RV64-LMULMAX8-NEXT: ld a0, %lo(.LCPI20_1)(a0) +; RV64-LMULMAX8-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX8-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX8-NEXT: vsetivli zero, 2, e64, 
m1, tu, mu +; RV64-LMULMAX8-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX8-NEXT: vslideup.vi v0, v8, 1 ; RV64-LMULMAX8-NEXT: ret ret <128 x i1> @@ -854,19 +1020,33 @@ ; RV32-LMULMAX1-LABEL: buildvec_mask_optsize_v128i1: ; RV32-LMULMAX1: # %bb.0: ; RV32-LMULMAX1-NEXT: li a0, 1776 -; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX1-NEXT: lui a0, 11 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX1-NEXT: lui a0, 8 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1718 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v12, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v12, a0 ; RV32-LMULMAX1-NEXT: lui a0, 4 ; RV32-LMULMAX1-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX1-NEXT: lui a0, 14 ; RV32-LMULMAX1-NEXT: addi a0, a0, 1722 +; RV32-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX1-NEXT: vmv.v.i v14, 0 +; RV32-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV32-LMULMAX1-NEXT: vmv.s.x v14, a0 ; RV32-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV32-LMULMAX1-NEXT: vmv1r.v v11, v0 @@ -876,19 +1056,33 @@ ; RV64-LMULMAX1-LABEL: buildvec_mask_optsize_v128i1: ; RV64-LMULMAX1: # %bb.0: ; RV64-LMULMAX1-NEXT: li a0, 1776 -; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX1-NEXT: lui a0, 11 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX1-NEXT: lui a0, 8 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1718 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v12, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v12, a0 ; RV64-LMULMAX1-NEXT: lui a0, 4 ; RV64-LMULMAX1-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v9, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v9, a0 ; RV64-LMULMAX1-NEXT: lui a0, 14 ; RV64-LMULMAX1-NEXT: addiw a0, a0, 1722 +; RV64-LMULMAX1-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX1-NEXT: vmv.v.i v14, 0 +; RV64-LMULMAX1-NEXT: vsetivli zero, 1, e16, mf4, tu, mu ; RV64-LMULMAX1-NEXT: vmv.s.x v14, a0 ; RV64-LMULMAX1-NEXT: vmv1r.v v10, v8 ; RV64-LMULMAX1-NEXT: vmv1r.v v11, v0 @@ -899,16 +1093,27 @@ ; RV32-LMULMAX2: # %bb.0: ; RV32-LMULMAX2-NEXT: lui a0, 748384 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 -; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV32-LMULMAX2-NEXT: 
lui a0, 748388 ; RV32-LMULMAX2-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV32-LMULMAX2-NEXT: lui a0, 551776 ; RV32-LMULMAX2-NEXT: addi a0, a0, 1776 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v9, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v9, a0 ; RV32-LMULMAX2-NEXT: lui a0, 945060 ; RV32-LMULMAX2-NEXT: addi a0, a0, -1793 +; RV32-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-LMULMAX2-NEXT: vmv.v.i v10, 0 +; RV32-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-LMULMAX2-NEXT: vmv.s.x v10, a0 ; RV32-LMULMAX2-NEXT: ret ; @@ -916,16 +1121,27 @@ ; RV64-LMULMAX2: # %bb.0: ; RV64-LMULMAX2-NEXT: lui a0, 748384 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 -; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v0, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX2-NEXT: lui a0, 748388 ; RV64-LMULMAX2-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v8, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v8, a0 ; RV64-LMULMAX2-NEXT: lui a0, 551776 ; RV64-LMULMAX2-NEXT: addiw a0, a0, 1776 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v9, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v9, a0 ; RV64-LMULMAX2-NEXT: lui a0, 945060 ; RV64-LMULMAX2-NEXT: addiw a0, a0, -1793 +; RV64-LMULMAX2-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-LMULMAX2-NEXT: vmv.v.i v10, 0 +; RV64-LMULMAX2-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV64-LMULMAX2-NEXT: vmv.s.x v10, a0 ; RV64-LMULMAX2-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -92,9 +92,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB1_2 ; RV64ZVE32F-NEXT: .LBB1_4: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) @@ -141,9 +142,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB2_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB2_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu @@ -195,9 +197,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB3_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; 
RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB3_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu @@ -249,9 +252,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB4_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB4_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu @@ -303,9 +307,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB5_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB5_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu @@ -338,6 +343,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 @@ -365,11 +372,14 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB6_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB6_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 @@ -401,6 +411,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 @@ -428,11 +440,14 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB7_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lb a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB7_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -489,7 +504,9 @@ ; RV64ZVE32F-NEXT: .LBB8_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -498,7 +515,9 @@ ; RV64ZVE32F-NEXT: .LBB8_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -507,9 +526,10 @@ ; RV64ZVE32F-NEXT: .LBB8_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lb a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru) @@ -558,7 +578,9 @@ ; RV64ZVE32F-NEXT: .LBB9_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -567,7 +589,9 @@ ; RV64ZVE32F-NEXT: .LBB9_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -576,9 +600,10 @@ ; RV64ZVE32F-NEXT: .LBB9_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lb a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 @@ -661,7 +686,9 @@ ; RV64ZVE32F-NEXT: .LBB11_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -670,7 +697,9 @@ ; RV64ZVE32F-NEXT: .LBB11_11: # 
%cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -679,7 +708,9 @@ ; RV64ZVE32F-NEXT: .LBB11_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 @@ -688,7 +719,9 @@ ; RV64ZVE32F-NEXT: .LBB11_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 @@ -697,7 +730,9 @@ ; RV64ZVE32F-NEXT: .LBB11_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld a2, 40(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 @@ -706,7 +741,9 @@ ; RV64ZVE32F-NEXT: .LBB11_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld a2, 48(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 @@ -715,9 +752,11 @@ ; RV64ZVE32F-NEXT: .LBB11_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a0, 56(a0) ; RV64ZVE32F-NEXT: lb a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 ; RV64ZVE32F-NEXT: ret %v = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru) @@ -756,98 +795,125 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB12_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB12_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, 
zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB12_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB12_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB12_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_14 -; RV64ZVE32F-NEXT: .LBB12_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB12_10 -; RV64ZVE32F-NEXT: .LBB12_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB12_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, 
e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB12_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB12_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB12_16 -; RV64ZVE32F-NEXT: .LBB12_12: # %else20 +; RV64ZVE32F-NEXT: .LBB12_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB12_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 -; RV64ZVE32F-NEXT: .LBB12_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_9 -; RV64ZVE32F-NEXT: j .LBB12_10 ; RV64ZVE32F-NEXT: .LBB12_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB12_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB12_14 ; RV64ZVE32F-NEXT: .LBB12_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 @@ -942,9 +1008,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB14_2 ; RV64ZVE32F-NEXT: .LBB14_4: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) @@ -991,9 +1058,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB15_4 ; RV64ZVE32F-NEXT: # %bb.3: # 
%cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB15_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu @@ -1045,9 +1113,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB16_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB16_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu @@ -1080,6 +1149,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1107,11 +1178,14 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB17_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB17_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 @@ -1143,6 +1217,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1172,9 +1248,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB18_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu @@ -1182,9 +1259,11 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 ; RV64ZVE32F-NEXT: and a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> 
@llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) @@ -1236,7 +1315,9 @@ ; RV64ZVE32F-NEXT: .LBB19_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -1245,7 +1326,9 @@ ; RV64ZVE32F-NEXT: .LBB19_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -1254,9 +1337,10 @@ ; RV64ZVE32F-NEXT: .LBB19_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru) @@ -1305,7 +1389,9 @@ ; RV64ZVE32F-NEXT: .LBB20_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -1314,7 +1400,9 @@ ; RV64ZVE32F-NEXT: .LBB20_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -1323,9 +1411,10 @@ ; RV64ZVE32F-NEXT: .LBB20_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 @@ -1408,7 +1497,9 @@ ; RV64ZVE32F-NEXT: .LBB22_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -1417,7 +1508,9 @@ ; RV64ZVE32F-NEXT: 
.LBB22_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -1426,7 +1519,9 @@ ; RV64ZVE32F-NEXT: .LBB22_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 @@ -1435,7 +1530,9 @@ ; RV64ZVE32F-NEXT: .LBB22_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 @@ -1444,7 +1541,9 @@ ; RV64ZVE32F-NEXT: .LBB22_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld a2, 40(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 @@ -1453,7 +1552,9 @@ ; RV64ZVE32F-NEXT: .LBB22_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld a2, 48(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 @@ -1462,9 +1563,11 @@ ; RV64ZVE32F-NEXT: .LBB22_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a0, 56(a0) ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 ; RV64ZVE32F-NEXT: ret %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) @@ -1506,113 +1609,134 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB23_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB23_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; 
RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB23_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB23_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB23_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_14 -; RV64ZVE32F-NEXT: .LBB23_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB23_10 -; RV64ZVE32F-NEXT: .LBB23_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB23_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, 
v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB23_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB23_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB23_16 -; RV64ZVE32F-NEXT: .LBB23_12: # %else20 +; RV64ZVE32F-NEXT: .LBB23_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB23_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 -; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_9 -; RV64ZVE32F-NEXT: j .LBB23_10 ; RV64ZVE32F-NEXT: .LBB23_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB23_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB23_14 ; RV64ZVE32F-NEXT: .LBB23_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi 
v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -1656,113 +1780,134 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB24_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB24_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB24_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB24_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_14 -; RV64ZVE32F-NEXT: .LBB24_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_10 -; RV64ZVE32F-NEXT: .LBB24_9: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB24_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB24_16 -; RV64ZVE32F-NEXT: .LBB24_12: # %else20 -; 
RV64ZVE32F-NEXT: vmv1r.v v8, v9 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB24_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB24_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_8 -; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB24_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_9 -; RV64ZVE32F-NEXT: j .LBB24_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB24_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB24_16 +; RV64ZVE32F-NEXT: .LBB24_14: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB24_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB24_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB24_14 ; 
RV64ZVE32F-NEXT: .LBB24_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -1808,9 +1953,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB25_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB25_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 @@ -1818,110 +1966,128 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB25_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB25_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; 
RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB25_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_14 -; RV64ZVE32F-NEXT: .LBB25_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB25_10 -; RV64ZVE32F-NEXT: .LBB25_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB25_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB25_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB25_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB25_16 -; RV64ZVE32F-NEXT: .LBB25_12: # %else20 +; RV64ZVE32F-NEXT: .LBB25_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB25_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 -; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: 
slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_9 -; RV64ZVE32F-NEXT: j .LBB25_10 ; RV64ZVE32F-NEXT: .LBB25_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB25_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB25_14 ; RV64ZVE32F-NEXT: .LBB25_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -1967,105 +2133,132 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB26_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB26_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, 
e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB26_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB26_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_14 -; RV64ZVE32F-NEXT: .LBB26_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_10 -; RV64ZVE32F-NEXT: .LBB26_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB26_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB26_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB26_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB26_16 -; RV64ZVE32F-NEXT: .LBB26_12: # %else20 +; RV64ZVE32F-NEXT: .LBB26_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB26_13: # 
%cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 -; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_9 -; RV64ZVE32F-NEXT: j .LBB26_10 ; RV64ZVE32F-NEXT: .LBB26_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB26_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB26_14 ; RV64ZVE32F-NEXT: .LBB26_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 @@ -2160,9 +2353,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB28_2 ; RV64ZVE32F-NEXT: .LBB28_4: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) @@ -2190,6 +2384,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 @@ -2218,11 +2414,14 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB29_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 @@ -2254,6 +2453,8 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v8, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: sw zero, 12(a0) @@ -2278,11 +2479,14 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB30_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -2341,7 +2545,9 @@ ; RV64ZVE32F-NEXT: .LBB31_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -2350,7 +2556,9 @@ ; RV64ZVE32F-NEXT: .LBB31_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -2359,9 +2567,10 @@ ; RV64ZVE32F-NEXT: .LBB31_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru) @@ -2409,7 +2618,9 @@ ; RV64ZVE32F-NEXT: .LBB32_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -2418,7 +2629,9 @@ ; RV64ZVE32F-NEXT: .LBB32_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; 
RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -2427,9 +2640,10 @@ ; RV64ZVE32F-NEXT: .LBB32_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 @@ -2512,7 +2726,9 @@ ; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 @@ -2521,7 +2737,9 @@ ; RV64ZVE32F-NEXT: .LBB34_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 @@ -2530,7 +2748,9 @@ ; RV64ZVE32F-NEXT: .LBB34_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 @@ -2539,7 +2759,9 @@ ; RV64ZVE32F-NEXT: .LBB34_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 @@ -2548,7 +2770,9 @@ ; RV64ZVE32F-NEXT: .LBB34_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld a2, 40(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 5 @@ -2557,7 +2781,9 @@ ; RV64ZVE32F-NEXT: .LBB34_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld a2, 48(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; 
RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 6 @@ -2566,9 +2792,10 @@ ; RV64ZVE32F-NEXT: .LBB34_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a0, 56(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 7 ; RV64ZVE32F-NEXT: ret %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -2609,113 +2836,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB35_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB35_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, 
a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB35_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 -; RV64ZVE32F-NEXT: .LBB35_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 -; RV64ZVE32F-NEXT: .LBB35_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB35_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 -; RV64ZVE32F-NEXT: .LBB35_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB35_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB35_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 -; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_9 -; RV64ZVE32F-NEXT: j .LBB35_10 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB35_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: 
bnez a1, .LBB35_16 +; RV64ZVE32F-NEXT: .LBB35_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB35_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB35_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_14 ; RV64ZVE32F-NEXT: .LBB35_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -2758,113 +3005,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB36_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB36_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB36_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, 
v14, 2 ; RV64ZVE32F-NEXT: .LBB36_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_14 -; RV64ZVE32F-NEXT: .LBB36_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_10 -; RV64ZVE32F-NEXT: .LBB36_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB36_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB36_16 -; RV64ZVE32F-NEXT: .LBB36_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB36_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB36_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 -; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB36_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_9 -; RV64ZVE32F-NEXT: j .LBB36_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 
2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB36_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB36_16 +; RV64ZVE32F-NEXT: .LBB36_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB36_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB36_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB36_14 ; RV64ZVE32F-NEXT: .LBB36_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -2909,9 +3176,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB37_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB37_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -2919,110 +3189,127 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB37_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB37_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB37_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_14 -; RV64ZVE32F-NEXT: .LBB37_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_10 -; RV64ZVE32F-NEXT: .LBB37_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB37_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB37_16 -; RV64ZVE32F-NEXT: .LBB37_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB37_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB37_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 -; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; 
RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB37_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_9 -; RV64ZVE32F-NEXT: j .LBB37_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB37_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB37_16 +; RV64ZVE32F-NEXT: .LBB37_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB37_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB37_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB37_14 ; RV64ZVE32F-NEXT: .LBB37_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -3067,113 +3354,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB38_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB38_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; 
RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB38_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB38_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_14 -; RV64ZVE32F-NEXT: .LBB38_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_10 -; RV64ZVE32F-NEXT: .LBB38_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB38_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB38_16 -; RV64ZVE32F-NEXT: .LBB38_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB38_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; 
RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB38_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 -; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB38_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_9 -; RV64ZVE32F-NEXT: j .LBB38_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB38_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB38_16 +; RV64ZVE32F-NEXT: .LBB38_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB38_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB38_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB38_14 ; RV64ZVE32F-NEXT: .LBB38_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; 
RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -3217,113 +3524,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB39_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB39_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB39_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB39_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_14 -; RV64ZVE32F-NEXT: .LBB39_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_10 -; RV64ZVE32F-NEXT: .LBB39_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB39_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, 
.LBB39_16 -; RV64ZVE32F-NEXT: .LBB39_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB39_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB39_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 -; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB39_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_9 -; RV64ZVE32F-NEXT: j .LBB39_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB39_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB39_16 +; RV64ZVE32F-NEXT: .LBB39_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB39_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, 
e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB39_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB39_14 ; RV64ZVE32F-NEXT: .LBB39_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -3371,9 +3698,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a3 ; RV64ZVE32F-NEXT: .LBB40_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB40_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -3381,110 +3711,127 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB40_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB40_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_14 -; RV64ZVE32F-NEXT: .LBB40_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_10 -; RV64ZVE32F-NEXT: .LBB40_9: # %cond.load13 -; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB40_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 -; RV64ZVE32F-NEXT: .LBB40_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB40_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB40_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_8 -; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB40_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_9 -; RV64ZVE32F-NEXT: j .LBB40_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; 
RV64ZVE32F-NEXT: .LBB40_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 +; RV64ZVE32F-NEXT: .LBB40_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_14 ; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -3528,109 +3875,138 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB41_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v16, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 1 ; RV64ZVE32F-NEXT: .LBB41_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, 
a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_14 ; RV64ZVE32F-NEXT: .LBB41_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_14 -; RV64ZVE32F-NEXT: .LBB41_7: # %else11 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_8 +; RV64ZVE32F-NEXT: .LBB41_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: .LBB41_8: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_9 -; RV64ZVE32F-NEXT: .LBB41_8: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 -; RV64ZVE32F-NEXT: .LBB41_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 +; RV64ZVE32F-NEXT: .LBB41_10: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB41_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else17 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB41_16 -; RV64ZVE32F-NEXT: .LBB41_11: # %else20 +; RV64ZVE32F-NEXT: .LBB41_12: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB41_12: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB41_13: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_6 -; RV64ZVE32F-NEXT: .LBB41_13: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB41_14: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v14, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu 
+; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_7 -; RV64ZVE32F-NEXT: .LBB41_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_8 -; RV64ZVE32F-NEXT: j .LBB41_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_7 +; RV64ZVE32F-NEXT: j .LBB41_8 ; RV64ZVE32F-NEXT: .LBB41_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB41_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB41_12 ; RV64ZVE32F-NEXT: .LBB41_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -3729,9 +4105,11 @@ ; RV32ZVE32F-NEXT: andi a4, a4, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB43_2 ; RV32ZVE32F-NEXT: .LBB43_4: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: lw a4, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB43_5: # %else2 @@ -3812,6 +4190,8 @@ ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB44_2 ; RV32ZVE32F-NEXT: .LBB44_6: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 @@ -3820,6 +4200,8 @@ ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB44_3 ; RV32ZVE32F-NEXT: .LBB44_7: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 @@ -3828,9 +4210,11 @@ ; RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: beqz a6, .LBB44_4 ; 
RV32ZVE32F-NEXT: .LBB44_8: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: lw a6, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB44_9: # %else8 @@ -3940,6 +4324,8 @@ ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB45_2 ; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 @@ -3948,6 +4334,8 @@ ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB45_3 ; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 @@ -3956,9 +4344,11 @@ ; RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: beqz a6, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: lw a6, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB45_9: # %else8 @@ -4145,6 +4535,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB47_2 ; RV32ZVE32F-NEXT: .LBB47_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -4153,6 +4545,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB47_3 ; RV32ZVE32F-NEXT: .LBB47_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -4161,6 +4555,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -4169,6 +4565,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -4177,6 +4575,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -4185,6 +4585,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; 
RV32ZVE32F-NEXT: beqz s0, .LBB47_7 ; RV32ZVE32F-NEXT: .LBB47_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -4193,9 +4595,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB47_8 ; RV32ZVE32F-NEXT: .LBB47_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a1) ; RV32ZVE32F-NEXT: lw a1, 0(a1) ; RV32ZVE32F-NEXT: .LBB47_17: # %else20 @@ -4392,6 +4796,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB48_2 ; RV32ZVE32F-NEXT: .LBB48_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -4400,6 +4806,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB48_3 ; RV32ZVE32F-NEXT: .LBB48_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -4408,6 +4816,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -4416,6 +4826,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -4424,6 +4836,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -4432,6 +4846,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB48_7 ; RV32ZVE32F-NEXT: .LBB48_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -4440,9 +4856,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB48_8 ; RV32ZVE32F-NEXT: .LBB48_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB48_17: # %else20 @@ 
-4472,108 +4890,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB48_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB48_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB48_4 +; RV64ZVE32F-NEXT: j .LBB48_3 ; RV64ZVE32F-NEXT: .LBB48_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB48_5 -; RV64ZVE32F-NEXT: .LBB48_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB48_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB48_2 -; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB48_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB48_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB48_6 +; RV64ZVE32F-NEXT: .LBB48_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB48_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB48_7 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB48_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB48_8 -; RV64ZVE32F-NEXT: .LBB48_7: +; RV64ZVE32F-NEXT: j .LBB48_9 +; RV64ZVE32F-NEXT: .LBB48_8: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB48_8: # %else5 +; RV64ZVE32F-NEXT: .LBB48_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB48_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB48_13 -; RV64ZVE32F-NEXT: .LBB48_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB48_14 ; RV64ZVE32F-NEXT: .LBB48_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB48_15 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB48_14 ; RV64ZVE32F-NEXT: .LBB48_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB48_10 +; RV64ZVE32F-NEXT: beqz t0, .LBB48_11 ; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load10 ; 
RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB48_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB48_11 -; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB48_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v10 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB48_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB48_17 +; RV64ZVE32F-NEXT: .LBB48_16: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB48_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB48_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB48_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB48_19 -; RV64ZVE32F-NEXT: .LBB48_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB48_21 +; RV64ZVE32F-NEXT: .LBB48_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB48_20 -; RV64ZVE32F-NEXT: .LBB48_18: +; RV64ZVE32F-NEXT: j .LBB48_22 +; RV64ZVE32F-NEXT: .LBB48_20: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB48_17 -; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_19 +; RV64ZVE32F-NEXT: .LBB48_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB48_20: # %else20 +; RV64ZVE32F-NEXT: .LBB48_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -4673,6 +5103,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB49_2 ; RV32ZVE32F-NEXT: .LBB49_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -4681,6 +5113,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB49_3 ; RV32ZVE32F-NEXT: .LBB49_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -4689,6 +5123,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_12: # 
%cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -4697,6 +5133,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -4705,6 +5143,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -4713,6 +5153,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB49_7 ; RV32ZVE32F-NEXT: .LBB49_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -4721,9 +5163,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB49_8 ; RV32ZVE32F-NEXT: .LBB49_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB49_17: # %else20 @@ -4753,108 +5197,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB49_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB49_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB49_4 +; RV64ZVE32F-NEXT: j .LBB49_3 ; RV64ZVE32F-NEXT: .LBB49_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB49_5 -; RV64ZVE32F-NEXT: .LBB49_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB49_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB49_2 -; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB49_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB49_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB49_6 +; RV64ZVE32F-NEXT: .LBB49_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB49_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB49_7 -; RV64ZVE32F-NEXT: 
# %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB49_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB49_8 -; RV64ZVE32F-NEXT: .LBB49_7: +; RV64ZVE32F-NEXT: j .LBB49_9 +; RV64ZVE32F-NEXT: .LBB49_8: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB49_8: # %else5 +; RV64ZVE32F-NEXT: .LBB49_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB49_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB49_13 -; RV64ZVE32F-NEXT: .LBB49_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB49_14 ; RV64ZVE32F-NEXT: .LBB49_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB49_15 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB49_14 ; RV64ZVE32F-NEXT: .LBB49_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB49_10 +; RV64ZVE32F-NEXT: beqz t0, .LBB49_11 ; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB49_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB49_11 -; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB49_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v10 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB49_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB49_17 +; RV64ZVE32F-NEXT: .LBB49_16: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB49_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB49_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB49_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB49_19 -; RV64ZVE32F-NEXT: .LBB49_17: +; RV64ZVE32F-NEXT: bnez a5, 
.LBB49_21 +; RV64ZVE32F-NEXT: .LBB49_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB49_20 -; RV64ZVE32F-NEXT: .LBB49_18: +; RV64ZVE32F-NEXT: j .LBB49_22 +; RV64ZVE32F-NEXT: .LBB49_20: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB49_17 -; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_19 +; RV64ZVE32F-NEXT: .LBB49_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB49_20: # %else20 +; RV64ZVE32F-NEXT: .LBB49_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -4955,6 +5411,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB50_2 ; RV32ZVE32F-NEXT: .LBB50_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -4963,6 +5421,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB50_3 ; RV32ZVE32F-NEXT: .LBB50_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -4971,6 +5431,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -4979,6 +5441,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -4987,6 +5451,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -4995,6 +5461,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB50_7 ; RV32ZVE32F-NEXT: .LBB50_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -5003,9 +5471,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB50_8 ; RV32ZVE32F-NEXT: .LBB50_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB50_17: # %else20 @@ -5035,23 +5505,23 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB50_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB50_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB50_4 +; RV64ZVE32F-NEXT: j .LBB50_3 ; RV64ZVE32F-NEXT: .LBB50_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB50_5 -; RV64ZVE32F-NEXT: .LBB50_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB50_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB50_2 -; RV64ZVE32F-NEXT: .LBB50_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB50_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 @@ -5059,92 +5529,104 @@ ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB50_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB50_6 +; RV64ZVE32F-NEXT: .LBB50_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB50_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB50_7 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB50_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: andi a6, a6, 255 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB50_8 -; RV64ZVE32F-NEXT: .LBB50_7: +; RV64ZVE32F-NEXT: j .LBB50_9 +; RV64ZVE32F-NEXT: .LBB50_8: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB50_8: # %else5 +; RV64ZVE32F-NEXT: .LBB50_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB50_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: andi a7, a7, 255 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB50_13 -; RV64ZVE32F-NEXT: .LBB50_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB50_14 ; RV64ZVE32F-NEXT: .LBB50_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB50_15 +; 
RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB50_14 ; RV64ZVE32F-NEXT: .LBB50_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB50_10 +; RV64ZVE32F-NEXT: beqz t0, .LBB50_11 ; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: andi t0, t0, 255 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB50_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB50_11 -; RV64ZVE32F-NEXT: .LBB50_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB50_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v10 ; RV64ZVE32F-NEXT: andi t1, t1, 255 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB50_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB50_17 +; RV64ZVE32F-NEXT: .LBB50_16: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB50_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB50_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB50_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: andi t2, t2, 255 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB50_19 -; RV64ZVE32F-NEXT: .LBB50_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB50_21 +; RV64ZVE32F-NEXT: .LBB50_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB50_20 -; RV64ZVE32F-NEXT: .LBB50_18: +; RV64ZVE32F-NEXT: j .LBB50_22 +; RV64ZVE32F-NEXT: .LBB50_20: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB50_17 -; RV64ZVE32F-NEXT: .LBB50_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB50_19 +; RV64ZVE32F-NEXT: .LBB50_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB50_20: # %else20 +; RV64ZVE32F-NEXT: .LBB50_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5243,6 +5725,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB51_2 ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -5251,6 +5735,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 
4 ; RV32ZVE32F-NEXT: beqz a6, .LBB51_3 ; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -5259,6 +5745,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -5267,6 +5755,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -5275,6 +5765,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -5283,6 +5775,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB51_7 ; RV32ZVE32F-NEXT: .LBB51_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -5291,9 +5785,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB51_8 ; RV32ZVE32F-NEXT: .LBB51_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB51_17: # %else20 @@ -5323,109 +5819,121 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB51_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB51_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB51_4 +; RV64ZVE32F-NEXT: j .LBB51_3 ; RV64ZVE32F-NEXT: .LBB51_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB51_5 -; RV64ZVE32F-NEXT: .LBB51_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB51_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB51_2 -; RV64ZVE32F-NEXT: .LBB51_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB51_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, 
a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB51_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB51_6 +; RV64ZVE32F-NEXT: .LBB51_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB51_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB51_7 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB51_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB51_8 -; RV64ZVE32F-NEXT: .LBB51_7: +; RV64ZVE32F-NEXT: j .LBB51_9 +; RV64ZVE32F-NEXT: .LBB51_8: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB51_8: # %else5 +; RV64ZVE32F-NEXT: .LBB51_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB51_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB51_13 -; RV64ZVE32F-NEXT: .LBB51_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB51_14 ; RV64ZVE32F-NEXT: .LBB51_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB51_15 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB51_14 ; RV64ZVE32F-NEXT: .LBB51_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB51_10 +; RV64ZVE32F-NEXT: beqz t0, .LBB51_11 ; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB51_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB51_11 -; RV64ZVE32F-NEXT: .LBB51_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB51_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v10 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB51_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB51_17 +; RV64ZVE32F-NEXT: .LBB51_16: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB51_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 
2 -; RV64ZVE32F-NEXT: beqz t2, .LBB51_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB51_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB51_19 -; RV64ZVE32F-NEXT: .LBB51_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB51_21 +; RV64ZVE32F-NEXT: .LBB51_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB51_20 -; RV64ZVE32F-NEXT: .LBB51_18: +; RV64ZVE32F-NEXT: j .LBB51_22 +; RV64ZVE32F-NEXT: .LBB51_20: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB51_17 -; RV64ZVE32F-NEXT: .LBB51_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB51_19 +; RV64ZVE32F-NEXT: .LBB51_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB51_20: # %else20 +; RV64ZVE32F-NEXT: .LBB51_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5525,6 +6033,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB52_2 ; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -5533,6 +6043,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB52_3 ; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -5541,6 +6053,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB52_4 ; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -5549,6 +6063,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB52_5 ; RV32ZVE32F-NEXT: .LBB52_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -5557,6 +6073,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB52_6 ; RV32ZVE32F-NEXT: .LBB52_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -5565,6 +6083,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB52_7 ; RV32ZVE32F-NEXT: .LBB52_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli 
zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -5573,9 +6093,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB52_8 ; RV32ZVE32F-NEXT: .LBB52_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB52_17: # %else20 @@ -5605,109 +6127,121 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB52_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB52_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB52_4 +; RV64ZVE32F-NEXT: j .LBB52_3 ; RV64ZVE32F-NEXT: .LBB52_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB52_5 -; RV64ZVE32F-NEXT: .LBB52_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB52_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB52_2 -; RV64ZVE32F-NEXT: .LBB52_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB52_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB52_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB52_6 +; RV64ZVE32F-NEXT: .LBB52_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB52_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB52_7 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB52_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB52_8 -; RV64ZVE32F-NEXT: .LBB52_7: +; RV64ZVE32F-NEXT: j .LBB52_9 +; RV64ZVE32F-NEXT: .LBB52_8: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB52_8: # %else5 +; RV64ZVE32F-NEXT: .LBB52_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB52_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, 
v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB52_13 -; RV64ZVE32F-NEXT: .LBB52_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB52_14 ; RV64ZVE32F-NEXT: .LBB52_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB52_15 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB52_14 ; RV64ZVE32F-NEXT: .LBB52_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB52_10 +; RV64ZVE32F-NEXT: beqz t0, .LBB52_11 ; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB52_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB52_11 -; RV64ZVE32F-NEXT: .LBB52_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB52_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v10 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB52_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB52_17 +; RV64ZVE32F-NEXT: .LBB52_16: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB52_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB52_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB52_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB52_19 -; RV64ZVE32F-NEXT: .LBB52_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB52_21 +; RV64ZVE32F-NEXT: .LBB52_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB52_20 -; RV64ZVE32F-NEXT: .LBB52_18: +; RV64ZVE32F-NEXT: j .LBB52_22 +; RV64ZVE32F-NEXT: .LBB52_20: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB52_17 -; RV64ZVE32F-NEXT: .LBB52_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB52_19 +; RV64ZVE32F-NEXT: .LBB52_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB52_20: # %else20 +; RV64ZVE32F-NEXT: .LBB52_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5808,6 +6342,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB53_2 ; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load1 
+; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -5816,6 +6352,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB53_3 ; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -5824,6 +6362,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB53_4 ; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -5832,6 +6372,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB53_5 ; RV32ZVE32F-NEXT: .LBB53_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -5840,6 +6382,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB53_6 ; RV32ZVE32F-NEXT: .LBB53_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -5848,6 +6392,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB53_7 ; RV32ZVE32F-NEXT: .LBB53_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -5856,9 +6402,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB53_8 ; RV32ZVE32F-NEXT: .LBB53_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB53_17: # %else20 @@ -5890,7 +6438,7 @@ ; RV64ZVE32F-NEXT: vmv.x.s a6, v0 ; RV64ZVE32F-NEXT: andi a4, a6, 1 ; RV64ZVE32F-NEXT: addiw a5, a3, -1 -; RV64ZVE32F-NEXT: beqz a4, .LBB53_3 +; RV64ZVE32F-NEXT: beqz a4, .LBB53_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -5898,16 +6446,16 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB53_4 +; RV64ZVE32F-NEXT: j .LBB53_3 ; RV64ZVE32F-NEXT: .LBB53_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB53_5 -; RV64ZVE32F-NEXT: .LBB53_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB53_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a6, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB53_2 -; RV64ZVE32F-NEXT: .LBB53_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a4, 
.LBB53_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 @@ -5915,92 +6463,104 @@ ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB53_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB53_6 +; RV64ZVE32F-NEXT: .LBB53_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB53_6: # %else2 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a7, a6, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a7, .LBB53_7 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a7, .LBB53_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a7, v10 ; RV64ZVE32F-NEXT: and a7, a7, a5 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: j .LBB53_8 -; RV64ZVE32F-NEXT: .LBB53_7: +; RV64ZVE32F-NEXT: j .LBB53_9 +; RV64ZVE32F-NEXT: .LBB53_8: ; RV64ZVE32F-NEXT: ld a7, 16(a2) -; RV64ZVE32F-NEXT: .LBB53_8: # %else5 +; RV64ZVE32F-NEXT: .LBB53_9: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t0, a6, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: beqz t0, .LBB53_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli t0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: and t0, t0, a5 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB53_13 -; RV64ZVE32F-NEXT: .LBB53_10: -; RV64ZVE32F-NEXT: ld t1, 32(a2) -; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: bnez t2, .LBB53_14 ; RV64ZVE32F-NEXT: .LBB53_11: -; RV64ZVE32F-NEXT: ld t2, 40(a2) -; RV64ZVE32F-NEXT: j .LBB53_15 +; RV64ZVE32F-NEXT: ld t1, 32(a2) +; RV64ZVE32F-NEXT: j .LBB53_14 ; RV64ZVE32F-NEXT: .LBB53_12: ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: andi t1, a6, 16 -; RV64ZVE32F-NEXT: beqz t1, .LBB53_10 +; RV64ZVE32F-NEXT: beqz t1, .LBB53_11 ; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: and t1, t1, a5 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB53_14: # %else11 +; RV64ZVE32F-NEXT: vsetvli t2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: beqz t2, .LBB53_11 -; RV64ZVE32F-NEXT: .LBB53_14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t2, .LBB53_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t2, v10 ; 
RV64ZVE32F-NEXT: and t2, t2, a5 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: .LBB53_15: # %else14 +; RV64ZVE32F-NEXT: j .LBB53_17 +; RV64ZVE32F-NEXT: .LBB53_16: +; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: .LBB53_17: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi t3, a6, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t3, .LBB53_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t3, .LBB53_20 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t3, v8 ; RV64ZVE32F-NEXT: and t3, t3, a5 ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) ; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB53_19 -; RV64ZVE32F-NEXT: .LBB53_17: +; RV64ZVE32F-NEXT: bnez a6, .LBB53_21 +; RV64ZVE32F-NEXT: .LBB53_19: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB53_20 -; RV64ZVE32F-NEXT: .LBB53_18: +; RV64ZVE32F-NEXT: j .LBB53_22 +; RV64ZVE32F-NEXT: .LBB53_20: ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB53_17 -; RV64ZVE32F-NEXT: .LBB53_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_19 +; RV64ZVE32F-NEXT: .LBB53_21: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: and a2, a2, a5 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB53_20: # %else20 +; RV64ZVE32F-NEXT: .LBB53_22: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a7, 16(a0) @@ -6097,6 +6657,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB54_2 ; RV32ZVE32F-NEXT: .LBB54_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -6105,6 +6667,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB54_3 ; RV32ZVE32F-NEXT: .LBB54_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -6113,6 +6677,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB54_4 ; RV32ZVE32F-NEXT: .LBB54_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -6121,6 +6687,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB54_5 ; RV32ZVE32F-NEXT: .LBB54_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -6129,6 +6697,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB54_6 ; 
RV32ZVE32F-NEXT: .LBB54_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -6137,6 +6707,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB54_7 ; RV32ZVE32F-NEXT: .LBB54_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -6145,9 +6717,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB54_8 ; RV32ZVE32F-NEXT: .LBB54_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB54_17: # %else20 @@ -6177,109 +6751,121 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB54_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB54_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB54_4 +; RV64ZVE32F-NEXT: j .LBB54_3 ; RV64ZVE32F-NEXT: .LBB54_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB54_5 -; RV64ZVE32F-NEXT: .LBB54_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB54_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB54_2 -; RV64ZVE32F-NEXT: .LBB54_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB54_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v13 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB54_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB54_6 +; RV64ZVE32F-NEXT: .LBB54_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB54_6: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB54_10 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v12 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB54_11 -; RV64ZVE32F-NEXT: .LBB54_7: +; RV64ZVE32F-NEXT: .LBB54_8: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; 
RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB54_12 -; RV64ZVE32F-NEXT: .LBB54_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB54_13 ; RV64ZVE32F-NEXT: .LBB54_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB54_14 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB54_13 ; RV64ZVE32F-NEXT: .LBB54_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB54_7 +; RV64ZVE32F-NEXT: beqz a7, .LBB54_8 ; RV64ZVE32F-NEXT: .LBB54_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB54_8 +; RV64ZVE32F-NEXT: beqz t0, .LBB54_9 ; RV64ZVE32F-NEXT: .LBB54_12: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB54_13: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB54_9 -; RV64ZVE32F-NEXT: .LBB54_13: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB54_15 +; RV64ZVE32F-NEXT: # %bb.14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB54_14: # %else14 +; RV64ZVE32F-NEXT: j .LBB54_16 +; RV64ZVE32F-NEXT: .LBB54_15: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB54_16: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB54_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: beqz t2, .LBB54_19 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB54_18 -; RV64ZVE32F-NEXT: .LBB54_16: +; RV64ZVE32F-NEXT: bnez a5, .LBB54_20 +; RV64ZVE32F-NEXT: .LBB54_18: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB54_19 -; RV64ZVE32F-NEXT: .LBB54_17: +; RV64ZVE32F-NEXT: j .LBB54_21 +; RV64ZVE32F-NEXT: .LBB54_19: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB54_16 -; RV64ZVE32F-NEXT: .LBB54_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB54_18 +; RV64ZVE32F-NEXT: .LBB54_20: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld 
a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB54_19: # %else20 +; RV64ZVE32F-NEXT: .LBB54_21: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -6378,6 +6964,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB55_2 ; RV32ZVE32F-NEXT: .LBB55_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -6386,6 +6974,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB55_3 ; RV32ZVE32F-NEXT: .LBB55_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -6394,6 +6984,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB55_4 ; RV32ZVE32F-NEXT: .LBB55_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -6402,6 +6994,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB55_5 ; RV32ZVE32F-NEXT: .LBB55_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -6410,6 +7004,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB55_6 ; RV32ZVE32F-NEXT: .LBB55_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -6418,6 +7014,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB55_7 ; RV32ZVE32F-NEXT: .LBB55_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -6426,9 +7024,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB55_8 ; RV32ZVE32F-NEXT: .LBB55_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB55_17: # %else20 @@ -6458,109 +7058,121 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB55_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB55_4 +; RV64ZVE32F-NEXT: j .LBB55_3 ; RV64ZVE32F-NEXT: .LBB55_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB55_5 -; 
RV64ZVE32F-NEXT: .LBB55_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB55_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB55_2 -; RV64ZVE32F-NEXT: .LBB55_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB55_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v13 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB55_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB55_6 +; RV64ZVE32F-NEXT: .LBB55_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB55_6: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB55_10 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v12 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB55_11 -; RV64ZVE32F-NEXT: .LBB55_7: +; RV64ZVE32F-NEXT: .LBB55_8: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB55_12 -; RV64ZVE32F-NEXT: .LBB55_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB55_13 ; RV64ZVE32F-NEXT: .LBB55_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB55_14 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB55_13 ; RV64ZVE32F-NEXT: .LBB55_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB55_7 +; RV64ZVE32F-NEXT: beqz a7, .LBB55_8 ; RV64ZVE32F-NEXT: .LBB55_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB55_8 +; RV64ZVE32F-NEXT: beqz t0, .LBB55_9 ; RV64ZVE32F-NEXT: .LBB55_12: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB55_13: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB55_9 -; RV64ZVE32F-NEXT: .LBB55_13: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB55_15 +; RV64ZVE32F-NEXT: # %bb.14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB55_14: # %else14 +; RV64ZVE32F-NEXT: j .LBB55_16 +; RV64ZVE32F-NEXT: .LBB55_15: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB55_16: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB55_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: beqz t2, .LBB55_19 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB55_18 -; RV64ZVE32F-NEXT: .LBB55_16: +; RV64ZVE32F-NEXT: bnez a5, .LBB55_20 +; RV64ZVE32F-NEXT: .LBB55_18: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB55_19 -; RV64ZVE32F-NEXT: .LBB55_17: +; RV64ZVE32F-NEXT: j .LBB55_21 +; RV64ZVE32F-NEXT: .LBB55_19: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB55_16 -; RV64ZVE32F-NEXT: .LBB55_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB55_18 +; RV64ZVE32F-NEXT: .LBB55_20: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB55_19: # %else20 +; RV64ZVE32F-NEXT: .LBB55_21: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -6660,6 +7272,8 @@ ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB56_2 ; RV32ZVE32F-NEXT: .LBB56_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 @@ -6668,6 +7282,8 @@ ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB56_3 ; RV32ZVE32F-NEXT: .LBB56_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a6, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 @@ -6676,6 +7292,8 @@ ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB56_4 ; RV32ZVE32F-NEXT: .LBB56_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli t1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 @@ -6684,6 +7302,8 @@ ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB56_5 ; RV32ZVE32F-NEXT: .LBB56_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli t3, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 @@ -6692,6 +7312,8 @@ ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB56_6 ; RV32ZVE32F-NEXT: .LBB56_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli t5, zero, e16, m2, ta, mu 
+; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 @@ -6700,6 +7322,8 @@ ; RV32ZVE32F-NEXT: andi s0, t0, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB56_7 ; RV32ZVE32F-NEXT: .LBB56_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli s0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 @@ -6708,9 +7332,11 @@ ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB56_8 ; RV32ZVE32F-NEXT: .LBB56_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 ; RV32ZVE32F-NEXT: lw t0, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) ; RV32ZVE32F-NEXT: .LBB56_17: # %else20 @@ -6740,7 +7366,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB56_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB56_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -6748,63 +7374,66 @@ ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB56_4 +; RV64ZVE32F-NEXT: j .LBB56_3 ; RV64ZVE32F-NEXT: .LBB56_2: -; RV64ZVE32F-NEXT: ld a4, 8(a2) -; RV64ZVE32F-NEXT: j .LBB56_5 -; RV64ZVE32F-NEXT: .LBB56_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB56_3: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a5, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB56_2 -; RV64ZVE32F-NEXT: .LBB56_4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB56_5 +; RV64ZVE32F-NEXT: # %bb.4: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v13 ; RV64ZVE32F-NEXT: slli a4, a4, 32 ; RV64ZVE32F-NEXT: srli a4, a4, 29 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: .LBB56_5: # %else2 +; RV64ZVE32F-NEXT: j .LBB56_6 +; RV64ZVE32F-NEXT: .LBB56_5: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: .LBB56_6: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a6, .LBB56_10 -; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v12 ; RV64ZVE32F-NEXT: slli a6, a6, 32 ; RV64ZVE32F-NEXT: srli a6, a6, 29 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: bnez a7, .LBB56_11 -; RV64ZVE32F-NEXT: .LBB56_7: +; RV64ZVE32F-NEXT: .LBB56_8: ; RV64ZVE32F-NEXT: ld a7, 24(a2) ; 
RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB56_12 -; RV64ZVE32F-NEXT: .LBB56_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB56_13 ; RV64ZVE32F-NEXT: .LBB56_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB56_14 +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: j .LBB56_13 ; RV64ZVE32F-NEXT: .LBB56_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB56_7 +; RV64ZVE32F-NEXT: beqz a7, .LBB56_8 ; RV64ZVE32F-NEXT: .LBB56_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 32 ; RV64ZVE32F-NEXT: srli a7, a7, 29 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB56_8 +; RV64ZVE32F-NEXT: beqz t0, .LBB56_9 ; RV64ZVE32F-NEXT: .LBB56_12: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s t0, v10 @@ -6812,45 +7441,54 @@ ; RV64ZVE32F-NEXT: srli t0, t0, 29 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: .LBB56_13: # %else11 +; RV64ZVE32F-NEXT: vsetvli t1, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB56_9 -; RV64ZVE32F-NEXT: .LBB56_13: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz t1, .LBB56_15 +; RV64ZVE32F-NEXT: # %bb.14: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 32 ; RV64ZVE32F-NEXT: srli t1, t1, 29 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB56_14: # %else14 +; RV64ZVE32F-NEXT: j .LBB56_16 +; RV64ZVE32F-NEXT: .LBB56_15: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: .LBB56_16: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB56_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: beqz t2, .LBB56_19 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 32 ; RV64ZVE32F-NEXT: srli t2, t2, 29 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB56_18 -; RV64ZVE32F-NEXT: .LBB56_16: +; RV64ZVE32F-NEXT: bnez a5, .LBB56_20 +; RV64ZVE32F-NEXT: .LBB56_18: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB56_19 -; RV64ZVE32F-NEXT: .LBB56_17: +; RV64ZVE32F-NEXT: j .LBB56_21 +; RV64ZVE32F-NEXT: .LBB56_19: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB56_16 -; RV64ZVE32F-NEXT: .LBB56_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB56_18 +; RV64ZVE32F-NEXT: .LBB56_20: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, 
v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB56_19: # %else20 +; RV64ZVE32F-NEXT: .LBB56_21: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -6929,9 +7567,9 @@ ; RV32ZVE32F-NEXT: lw t5, 24(a3) ; RV32ZVE32F-NEXT: lw t6, 20(a3) ; RV32ZVE32F-NEXT: lw s2, 16(a3) -; RV32ZVE32F-NEXT: lw s3, 12(a3) +; RV32ZVE32F-NEXT: lw s4, 12(a3) ; RV32ZVE32F-NEXT: lw s5, 8(a3) -; RV32ZVE32F-NEXT: lw s4, 4(a3) +; RV32ZVE32F-NEXT: lw s3, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) ; RV32ZVE32F-NEXT: lw s6, 0(a2) ; RV32ZVE32F-NEXT: lw s7, 8(a2) @@ -6980,16 +7618,18 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB57_9 ; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: lw a4, 4(a1) ; RV32ZVE32F-NEXT: lw a5, 0(a1) ; RV32ZVE32F-NEXT: .LBB57_9: # %else20 ; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw s4, 4(a0) +; RV32ZVE32F-NEXT: sw s3, 4(a0) ; RV32ZVE32F-NEXT: sw s5, 8(a0) -; RV32ZVE32F-NEXT: sw s3, 12(a0) +; RV32ZVE32F-NEXT: sw s4, 12(a0) ; RV32ZVE32F-NEXT: sw s2, 16(a0) ; RV32ZVE32F-NEXT: sw t6, 20(a0) ; RV32ZVE32F-NEXT: sw t5, 24(a0) @@ -7020,19 +7660,23 @@ ; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load ; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw s4, 4(a2) +; RV32ZVE32F-NEXT: lw s3, 4(a2) ; RV32ZVE32F-NEXT: lw a3, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_2 ; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 -; RV32ZVE32F-NEXT: lw s3, 4(a2) +; RV32ZVE32F-NEXT: lw s4, 4(a2) ; RV32ZVE32F-NEXT: lw s5, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_3 ; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7041,6 +7685,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_4 ; RV32ZVE32F-NEXT: .LBB57_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7049,6 +7695,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_5 ; RV32ZVE32F-NEXT: .LBB57_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7057,6 +7705,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_6 ; RV32ZVE32F-NEXT: .LBB57_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7065,6 +7715,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_7 ; RV32ZVE32F-NEXT: .LBB57_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7264,9 +7916,10 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB59_2 ; RV64ZVE32F-NEXT: .LBB59_4: # %cond.load1 ; RV64ZVE32F-NEXT: flh ft0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru) @@ -7317,7 +7970,9 @@ ; RV64ZVE32F-NEXT: .LBB60_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -7326,7 +7981,9 @@ ; RV64ZVE32F-NEXT: .LBB60_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -7335,9 +7992,10 @@ ; RV64ZVE32F-NEXT: .LBB60_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru) @@ -7386,7 +8044,9 @@ ; RV64ZVE32F-NEXT: .LBB61_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -7395,7 +8055,9 @@ ; RV64ZVE32F-NEXT: .LBB61_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -7404,9 
+8066,10 @@ ; RV64ZVE32F-NEXT: .LBB61_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 @@ -7489,7 +8152,9 @@ ; RV64ZVE32F-NEXT: .LBB63_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -7498,7 +8163,9 @@ ; RV64ZVE32F-NEXT: .LBB63_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -7507,7 +8174,9 @@ ; RV64ZVE32F-NEXT: .LBB63_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 @@ -7516,7 +8185,9 @@ ; RV64ZVE32F-NEXT: .LBB63_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 @@ -7525,7 +8196,9 @@ ; RV64ZVE32F-NEXT: .LBB63_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld a2, 40(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 @@ -7534,7 +8207,9 @@ ; RV64ZVE32F-NEXT: .LBB63_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld a2, 48(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 @@ -7543,9 +8218,11 @@ ; RV64ZVE32F-NEXT: .LBB63_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a0, 56(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli a0, 
zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 ; RV64ZVE32F-NEXT: ret %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) @@ -7587,113 +8264,134 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: .LBB64_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB64_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB64_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB64_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_14 -; RV64ZVE32F-NEXT: .LBB64_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_10 -; RV64ZVE32F-NEXT: .LBB64_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB64_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB64_16 -; 
RV64ZVE32F-NEXT: .LBB64_12: # %else20 -; RV64ZVE32F-NEXT: vmv1r.v v8, v9 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB64_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB64_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 -; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB64_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_9 -; RV64ZVE32F-NEXT: j .LBB64_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB64_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB64_16 +; RV64ZVE32F-NEXT: .LBB64_14: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB64_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, 
mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB64_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB64_14 ; RV64ZVE32F-NEXT: .LBB64_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -7737,113 +8435,134 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: .LBB65_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB65_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB65_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB65_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, 
m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB65_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_14 -; RV64ZVE32F-NEXT: .LBB65_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB65_10 -; RV64ZVE32F-NEXT: .LBB65_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB65_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB65_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB65_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB65_16 -; RV64ZVE32F-NEXT: .LBB65_12: # %else20 +; RV64ZVE32F-NEXT: .LBB65_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB65_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 -; RV64ZVE32F-NEXT: .LBB65_14: 
# %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_9 -; RV64ZVE32F-NEXT: j .LBB65_10 ; RV64ZVE32F-NEXT: .LBB65_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB65_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB65_14 ; RV64ZVE32F-NEXT: .LBB65_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -7889,9 +8608,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: .LBB66_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB66_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 @@ -7899,110 +8621,128 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB66_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; 
RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB66_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB66_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_14 -; RV64ZVE32F-NEXT: .LBB66_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB66_10 -; RV64ZVE32F-NEXT: .LBB66_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB66_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB66_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB66_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB66_16 -; RV64ZVE32F-NEXT: .LBB66_12: # %else20 +; RV64ZVE32F-NEXT: .LBB66_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB66_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 -; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_9 -; RV64ZVE32F-NEXT: j .LBB66_10 ; RV64ZVE32F-NEXT: .LBB66_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB66_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB66_14 ; RV64ZVE32F-NEXT: .LBB66_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flh ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -8048,105 +8788,132 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: .LBB67_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 
1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB67_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB67_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: .LBB67_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_14 -; RV64ZVE32F-NEXT: .LBB67_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_10 -; RV64ZVE32F-NEXT: .LBB67_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB67_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; 
RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB67_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB67_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB67_16 -; RV64ZVE32F-NEXT: .LBB67_12: # %else20 +; RV64ZVE32F-NEXT: .LBB67_14: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB67_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 -; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_9 -; RV64ZVE32F-NEXT: j .LBB67_10 ; RV64ZVE32F-NEXT: .LBB67_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB67_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB67_14 ; RV64ZVE32F-NEXT: .LBB67_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 @@ -8241,9 +9008,10 @@ ; 
RV64ZVE32F-NEXT: beqz a0, .LBB69_2 ; RV64ZVE32F-NEXT: .LBB69_4: # %cond.load1 ; RV64ZVE32F-NEXT: flw ft0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru) @@ -8294,7 +9062,9 @@ ; RV64ZVE32F-NEXT: .LBB70_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -8303,7 +9073,9 @@ ; RV64ZVE32F-NEXT: .LBB70_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -8312,9 +9084,10 @@ ; RV64ZVE32F-NEXT: .LBB70_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru) @@ -8362,7 +9135,9 @@ ; RV64ZVE32F-NEXT: .LBB71_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 @@ -8371,7 +9146,9 @@ ; RV64ZVE32F-NEXT: .LBB71_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 @@ -8380,9 +9157,10 @@ ; RV64ZVE32F-NEXT: .LBB71_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 
@@ -8465,7 +9243,9 @@ ; RV64ZVE32F-NEXT: .LBB73_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 @@ -8474,7 +9254,9 @@ ; RV64ZVE32F-NEXT: .LBB73_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 @@ -8483,7 +9265,9 @@ ; RV64ZVE32F-NEXT: .LBB73_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 @@ -8492,7 +9276,9 @@ ; RV64ZVE32F-NEXT: .LBB73_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 @@ -8501,7 +9287,9 @@ ; RV64ZVE32F-NEXT: .LBB73_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld a2, 40(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 5 @@ -8510,7 +9298,9 @@ ; RV64ZVE32F-NEXT: .LBB73_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld a2, 48(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 6 @@ -8519,9 +9309,10 @@ ; RV64ZVE32F-NEXT: .LBB73_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a0, 56(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 7 ; RV64ZVE32F-NEXT: ret %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -8562,113 +9353,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB74_2: # %else +; 
RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB74_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB74_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB74_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_14 -; RV64ZVE32F-NEXT: .LBB74_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_10 -; RV64ZVE32F-NEXT: .LBB74_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB74_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 -; RV64ZVE32F-NEXT: .LBB74_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB74_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; 
RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB74_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 -; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB74_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_9 -; RV64ZVE32F-NEXT: j .LBB74_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB74_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 +; RV64ZVE32F-NEXT: .LBB74_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB74_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB74_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB74_14 ; RV64ZVE32F-NEXT: .LBB74_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -8711,113 +9522,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB75_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB75_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB75_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB75_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB75_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB75_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB75_14 -; RV64ZVE32F-NEXT: .LBB75_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_10 -; RV64ZVE32F-NEXT: .LBB75_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB75_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, 
mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB75_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB75_16 -; RV64ZVE32F-NEXT: .LBB75_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB75_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB75_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_8 -; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB75_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB75_9 -; RV64ZVE32F-NEXT: j .LBB75_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB75_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB75_16 +; RV64ZVE32F-NEXT: .LBB75_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB75_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli 
zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB75_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB75_14 ; RV64ZVE32F-NEXT: .LBB75_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -8862,9 +9693,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB76_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB76_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -8872,110 +9706,127 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB76_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB76_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB76_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB76_13 -; RV64ZVE32F-NEXT: # %bb.7: # 
%else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB76_14 -; RV64ZVE32F-NEXT: .LBB76_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_10 -; RV64ZVE32F-NEXT: .LBB76_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB76_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB76_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB76_16 -; RV64ZVE32F-NEXT: .LBB76_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB76_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB76_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_8 -; RV64ZVE32F-NEXT: .LBB76_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB76_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB76_9 -; RV64ZVE32F-NEXT: j .LBB76_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; 
RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB76_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB76_16 +; RV64ZVE32F-NEXT: .LBB76_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB76_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB76_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB76_14 ; RV64ZVE32F-NEXT: .LBB76_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -9020,113 +9871,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB77_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB77_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB77_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: 
vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB77_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB77_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB77_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB77_14 -; RV64ZVE32F-NEXT: .LBB77_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_10 -; RV64ZVE32F-NEXT: .LBB77_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB77_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB77_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB77_16 -; RV64ZVE32F-NEXT: .LBB77_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB77_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB77_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_8 -; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; 
RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB77_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB77_9 -; RV64ZVE32F-NEXT: j .LBB77_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB77_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB77_16 +; RV64ZVE32F-NEXT: .LBB77_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB77_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB77_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB77_14 ; RV64ZVE32F-NEXT: .LBB77_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -9170,113 +10041,133 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB78_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB78_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; 
RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB78_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB78_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_14 -; RV64ZVE32F-NEXT: .LBB78_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_10 -; RV64ZVE32F-NEXT: .LBB78_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB78_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 -; RV64ZVE32F-NEXT: .LBB78_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB78_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: 
vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB78_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 -; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB78_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_9 -; RV64ZVE32F-NEXT: j .LBB78_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB78_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 +; RV64ZVE32F-NEXT: .LBB78_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB78_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB78_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB78_14 ; RV64ZVE32F-NEXT: .LBB78_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli 
zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -9324,9 +10215,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB79_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB79_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -9334,110 +10228,127 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB79_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB79_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB79_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB79_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB79_14 -; RV64ZVE32F-NEXT: .LBB79_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_10 -; RV64ZVE32F-NEXT: .LBB79_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB79_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB79_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_16 -; RV64ZVE32F-NEXT: .LBB79_12: # %else20 -; RV64ZVE32F-NEXT: vmv2r.v v8, v10 -; 
RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB79_13: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB79_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_8 -; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB79_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB79_9 -; RV64ZVE32F-NEXT: j .LBB79_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB79_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB79_16 +; RV64ZVE32F-NEXT: .LBB79_14: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB79_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw ft0, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; 
RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB79_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_14 ; RV64ZVE32F-NEXT: .LBB79_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -9481,109 +10392,138 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 ; RV64ZVE32F-NEXT: .LBB80_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB80_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v16, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 1 ; RV64ZVE32F-NEXT: .LBB80_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_14 ; RV64ZVE32F-NEXT: .LBB80_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_14 -; RV64ZVE32F-NEXT: .LBB80_7: # %else11 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_8 +; RV64ZVE32F-NEXT: .LBB80_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: .LBB80_8: # 
%else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_9 -; RV64ZVE32F-NEXT: .LBB80_8: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 -; RV64ZVE32F-NEXT: .LBB80_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 +; RV64ZVE32F-NEXT: .LBB80_10: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB80_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else17 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB80_16 -; RV64ZVE32F-NEXT: .LBB80_11: # %else20 +; RV64ZVE32F-NEXT: .LBB80_12: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB80_12: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB80_13: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 2 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB80_6 -; RV64ZVE32F-NEXT: .LBB80_13: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB80_14: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v14, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_7 -; RV64ZVE32F-NEXT: .LBB80_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw ft0, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_8 -; 
RV64ZVE32F-NEXT: j .LBB80_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_7 +; RV64ZVE32F-NEXT: j .LBB80_8 ; RV64ZVE32F-NEXT: .LBB80_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB80_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB80_12 ; RV64ZVE32F-NEXT: .LBB80_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: flw ft0, 0(a0) -; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret @@ -9675,9 +10615,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB82_2 ; RV32ZVE32F-NEXT: .LBB82_4: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: fld fa1, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9736,9 +10678,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_5 ; RV32ZVE32F-NEXT: .LBB83_4: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: fld fa3, 0(a1) ; RV32ZVE32F-NEXT: .LBB83_5: # %else8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -9753,6 +10697,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB83_2 ; RV32ZVE32F-NEXT: .LBB83_7: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9760,6 +10706,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB83_3 ; RV32ZVE32F-NEXT: .LBB83_8: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9842,9 +10790,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_5 ; RV32ZVE32F-NEXT: .LBB84_4: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; 
RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: fld fa3, 0(a1) ; RV32ZVE32F-NEXT: .LBB84_5: # %else8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -9859,6 +10809,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB84_2 ; RV32ZVE32F-NEXT: .LBB84_7: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9866,6 +10818,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB84_3 ; RV32ZVE32F-NEXT: .LBB84_8: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9995,9 +10949,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_9 ; RV32ZVE32F-NEXT: .LBB86_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB86_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -10016,6 +10972,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_2 ; RV32ZVE32F-NEXT: .LBB86_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10023,6 +10981,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_3 ; RV32ZVE32F-NEXT: .LBB86_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10030,6 +10990,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_4 ; RV32ZVE32F-NEXT: .LBB86_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10037,6 +10999,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_5 ; RV32ZVE32F-NEXT: .LBB86_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10044,6 +11008,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_6 ; RV32ZVE32F-NEXT: .LBB86_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10051,6 +11017,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB86_7 ; RV32ZVE32F-NEXT: .LBB86_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10191,9 +11159,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_9 ; RV32ZVE32F-NEXT: .LBB87_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB87_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -10212,6 +11182,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_2 ; RV32ZVE32F-NEXT: .LBB87_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10219,6 +11191,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_3 ; RV32ZVE32F-NEXT: .LBB87_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10226,6 +11200,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_4 ; RV32ZVE32F-NEXT: .LBB87_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10233,6 +11209,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_5 ; RV32ZVE32F-NEXT: .LBB87_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10240,6 +11218,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_6 ; RV32ZVE32F-NEXT: .LBB87_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10247,6 +11227,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB87_7 ; RV32ZVE32F-NEXT: .LBB87_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10267,9 +11249,12 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB87_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB87_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -10277,54 +11262,76 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB87_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; 
RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB87_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: .LBB87_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB87_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB87_16 -; RV64ZVE32F-NEXT: .LBB87_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB87_10 -; RV64ZVE32F-NEXT: .LBB87_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB87_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_12: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB87_12: # %else17 +; RV64ZVE32F-NEXT: .LBB87_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; 
RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB87_14: # %else20 +; RV64ZVE32F-NEXT: .LBB87_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10334,24 +11341,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB87_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_8 -; RV64ZVE32F-NEXT: .LBB87_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB87_9 -; RV64ZVE32F-NEXT: j .LBB87_10 %ptrs = getelementptr inbounds double, double* %base, <8 x i8> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -10411,9 +11400,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_9 ; RV32ZVE32F-NEXT: .LBB88_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB88_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -10432,6 +11423,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_2 ; RV32ZVE32F-NEXT: .LBB88_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10439,6 +11432,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_3 ; RV32ZVE32F-NEXT: .LBB88_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10446,6 +11441,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_4 ; RV32ZVE32F-NEXT: .LBB88_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10453,6 +11450,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_5 ; RV32ZVE32F-NEXT: .LBB88_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10460,6 +11459,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_6 ; RV32ZVE32F-NEXT: .LBB88_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10467,6 +11468,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB88_7 ; RV32ZVE32F-NEXT: .LBB88_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10487,9 +11490,12 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB88_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB88_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -10497,54 +11503,76 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB88_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB88_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: .LBB88_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB88_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB88_16 -; RV64ZVE32F-NEXT: .LBB88_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB88_10 -; RV64ZVE32F-NEXT: .LBB88_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB88_10: # %else14 +; RV64ZVE32F-NEXT: .LBB88_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a3, 
a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB88_12: # %else17 +; RV64ZVE32F-NEXT: .LBB88_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB88_14: # %else20 +; RV64ZVE32F-NEXT: .LBB88_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10554,24 +11582,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB88_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_8 -; RV64ZVE32F-NEXT: .LBB88_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB88_9 -; RV64ZVE32F-NEXT: j .LBB88_10 %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -10632,9 +11642,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_9 ; RV32ZVE32F-NEXT: .LBB89_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB89_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -10653,6 +11665,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_2 ; RV32ZVE32F-NEXT: .LBB89_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10660,6 +11674,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_3 ; RV32ZVE32F-NEXT: .LBB89_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi 
v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10667,6 +11683,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_4 ; RV32ZVE32F-NEXT: .LBB89_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10674,6 +11692,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_5 ; RV32ZVE32F-NEXT: .LBB89_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10681,6 +11701,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_6 ; RV32ZVE32F-NEXT: .LBB89_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10688,6 +11710,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB89_7 ; RV32ZVE32F-NEXT: .LBB89_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10709,9 +11733,12 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB89_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB89_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -10720,12 +11747,14 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB89_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB89_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -10733,45 +11762,67 @@ ; RV64ZVE32F-NEXT: .LBB89_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_16 -; RV64ZVE32F-NEXT: .LBB89_8: # %else11 -; 
RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB89_10 -; RV64ZVE32F-NEXT: .LBB89_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB89_10: # %else14 +; RV64ZVE32F-NEXT: .LBB89_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB89_12: # %else17 +; RV64ZVE32F-NEXT: .LBB89_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB89_14: # %else20 +; RV64ZVE32F-NEXT: .LBB89_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10781,26 +11832,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: andi a3, a3, 255 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_8 -; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: andi a3, a3, 255 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_9 -; RV64ZVE32F-NEXT: j .LBB89_10 %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> 
%eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -10859,9 +11890,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_9 ; RV32ZVE32F-NEXT: .LBB90_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB90_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -10880,6 +11913,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_2 ; RV32ZVE32F-NEXT: .LBB90_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10887,6 +11922,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_3 ; RV32ZVE32F-NEXT: .LBB90_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10894,6 +11931,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_4 ; RV32ZVE32F-NEXT: .LBB90_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10901,6 +11940,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_5 ; RV32ZVE32F-NEXT: .LBB90_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10908,6 +11949,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_6 ; RV32ZVE32F-NEXT: .LBB90_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10915,6 +11958,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB90_7 ; RV32ZVE32F-NEXT: .LBB90_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10936,9 +11981,12 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB90_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB90_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -10946,54 +11994,76 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB90_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB90_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: .LBB90_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB90_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB90_16 -; RV64ZVE32F-NEXT: .LBB90_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB90_10 -; RV64ZVE32F-NEXT: .LBB90_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB90_10: # %else14 +; RV64ZVE32F-NEXT: .LBB90_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB90_12: # %else17 +; RV64ZVE32F-NEXT: .LBB90_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: 
.LBB90_14: # %else20 +; RV64ZVE32F-NEXT: .LBB90_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11003,24 +12073,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB90_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_8 -; RV64ZVE32F-NEXT: .LBB90_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB90_9 -; RV64ZVE32F-NEXT: j .LBB90_10 %ptrs = getelementptr inbounds double, double* %base, <8 x i16> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -11080,9 +12132,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB91_9 ; RV32ZVE32F-NEXT: .LBB91_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB91_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -11101,6 +12155,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_2 ; RV32ZVE32F-NEXT: .LBB91_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11108,6 +12164,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_3 ; RV32ZVE32F-NEXT: .LBB91_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11115,6 +12173,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_4 ; RV32ZVE32F-NEXT: .LBB91_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11122,6 +12182,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_5 ; RV32ZVE32F-NEXT: .LBB91_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11129,6 +12191,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_6 ; RV32ZVE32F-NEXT: .LBB91_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, 
v10 @@ -11136,6 +12200,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB91_7 ; RV32ZVE32F-NEXT: .LBB91_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11157,9 +12223,12 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB91_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB91_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -11167,54 +12236,76 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB91_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: .LBB91_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB91_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB91_16 -; RV64ZVE32F-NEXT: .LBB91_8: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB91_10: # %else14 +; RV64ZVE32F-NEXT: .LBB91_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 
2 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB91_12: # %else17 +; RV64ZVE32F-NEXT: .LBB91_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB91_14: # %else20 +; RV64ZVE32F-NEXT: .LBB91_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11224,24 +12315,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB91_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_8 -; RV64ZVE32F-NEXT: .LBB91_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB91_9 -; RV64ZVE32F-NEXT: j .LBB91_10 %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -11302,9 +12375,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB92_9 ; RV32ZVE32F-NEXT: .LBB92_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB92_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -11323,6 +12398,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_2 ; RV32ZVE32F-NEXT: .LBB92_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11330,6 +12407,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_3 ; RV32ZVE32F-NEXT: .LBB92_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 
@@ -11337,6 +12416,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_4 ; RV32ZVE32F-NEXT: .LBB92_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11344,6 +12425,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_5 ; RV32ZVE32F-NEXT: .LBB92_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11351,6 +12434,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_6 ; RV32ZVE32F-NEXT: .LBB92_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11358,6 +12443,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB92_7 ; RV32ZVE32F-NEXT: .LBB92_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11382,9 +12469,12 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa0, 0(a4) ; RV64ZVE32F-NEXT: .LBB92_2: # %else +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a3, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a4, .LBB92_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 @@ -11393,12 +12483,14 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa1, 0(a4) ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a4, a3, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB92_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 @@ -11406,45 +12498,67 @@ ; RV64ZVE32F-NEXT: .LBB92_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a4, a3, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a4, .LBB92_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa3, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_8: # %else8 ; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: bnez a4, .LBB92_16 -; RV64ZVE32F-NEXT: .LBB92_8: # %else11 -; RV64ZVE32F-NEXT: andi a4, a3, 32 ; 
RV64ZVE32F-NEXT: beqz a4, .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa4, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa5, 0(a4) -; RV64ZVE32F-NEXT: .LBB92_10: # %else14 +; RV64ZVE32F-NEXT: .LBB92_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a4, a3, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa6, 0(a4) -; RV64ZVE32F-NEXT: .LBB92_12: # %else17 +; RV64ZVE32F-NEXT: .LBB92_14: # %else17 ; RV64ZVE32F-NEXT: andi a3, a3, -128 -; RV64ZVE32F-NEXT: beqz a3, .LBB92_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a3, .LBB92_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a2, a3, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB92_14: # %else20 +; RV64ZVE32F-NEXT: .LBB92_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11454,26 +12568,6 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB92_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa3, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_8 -; RV64ZVE32F-NEXT: .LBB92_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa4, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a3, 32 -; RV64ZVE32F-NEXT: bnez a4, .LBB92_9 -; RV64ZVE32F-NEXT: j .LBB92_10 %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> 
@llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -11530,9 +12624,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB93_9 ; RV32ZVE32F-NEXT: .LBB93_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB93_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -11551,6 +12647,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_2 ; RV32ZVE32F-NEXT: .LBB93_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11558,6 +12656,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_3 ; RV32ZVE32F-NEXT: .LBB93_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11565,6 +12665,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_4 ; RV32ZVE32F-NEXT: .LBB93_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11572,6 +12674,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_5 ; RV32ZVE32F-NEXT: .LBB93_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11579,6 +12683,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_6 ; RV32ZVE32F-NEXT: .LBB93_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11586,6 +12692,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB93_7 ; RV32ZVE32F-NEXT: .LBB93_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11607,12 +12715,17 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB93_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB93_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v13 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, 
a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) @@ -11621,45 +12734,56 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB93_14 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_15 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB93_15 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_16 ; RV64ZVE32F-NEXT: .LBB93_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB93_16 -; RV64ZVE32F-NEXT: .LBB93_7: # %else11 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_8 +; RV64ZVE32F-NEXT: .LBB93_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB93_8: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB93_9 -; RV64ZVE32F-NEXT: .LBB93_8: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB93_9: # %else14 +; RV64ZVE32F-NEXT: .LBB93_10: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB93_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB93_11: # %else17 +; RV64ZVE32F-NEXT: .LBB93_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB93_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB93_13: # %else20 +; RV64ZVE32F-NEXT: .LBB93_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11669,31 +12793,25 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB93_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: .LBB93_15: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 8 ; RV64ZVE32F-NEXT: beqz a3, .LBB93_6 -; RV64ZVE32F-NEXT: .LBB93_15: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB93_16: # %cond.load7 +; 
RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB93_7 -; RV64ZVE32F-NEXT: .LBB93_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB93_8 -; RV64ZVE32F-NEXT: j .LBB93_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_7 +; RV64ZVE32F-NEXT: j .LBB93_8 %ptrs = getelementptr inbounds double, double* %base, <8 x i32> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -11752,9 +12870,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB94_9 ; RV32ZVE32F-NEXT: .LBB94_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB94_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -11773,6 +12893,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_2 ; RV32ZVE32F-NEXT: .LBB94_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11780,6 +12902,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_3 ; RV32ZVE32F-NEXT: .LBB94_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11787,6 +12911,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_4 ; RV32ZVE32F-NEXT: .LBB94_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11794,6 +12920,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_5 ; RV32ZVE32F-NEXT: .LBB94_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11801,6 +12929,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_6 ; RV32ZVE32F-NEXT: .LBB94_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11808,6 +12938,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB94_7 ; RV32ZVE32F-NEXT: .LBB94_16: # 
%cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -11829,12 +12961,17 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB94_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB94_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v13 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) @@ -11843,45 +12980,56 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB94_14 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_15 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB94_15 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_16 ; RV64ZVE32F-NEXT: .LBB94_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB94_16 -; RV64ZVE32F-NEXT: .LBB94_7: # %else11 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_8 +; RV64ZVE32F-NEXT: .LBB94_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB94_8: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB94_9 -; RV64ZVE32F-NEXT: .LBB94_8: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB94_9: # %else14 +; RV64ZVE32F-NEXT: .LBB94_10: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB94_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB94_11: # %else17 +; RV64ZVE32F-NEXT: .LBB94_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB94_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB94_13: # %else20 +; RV64ZVE32F-NEXT: .LBB94_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11891,31 +13039,25 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB94_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: .LBB94_15: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 8 ; RV64ZVE32F-NEXT: beqz a3, .LBB94_6 -; RV64ZVE32F-NEXT: .LBB94_15: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB94_16: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB94_7 -; RV64ZVE32F-NEXT: .LBB94_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB94_8 -; RV64ZVE32F-NEXT: j .LBB94_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_7 +; RV64ZVE32F-NEXT: j .LBB94_8 %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -11975,9 +13117,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB95_9 ; RV32ZVE32F-NEXT: .LBB95_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB95_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -11996,6 +13140,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_2 ; RV32ZVE32F-NEXT: .LBB95_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12003,6 +13149,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_3 ; RV32ZVE32F-NEXT: .LBB95_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12010,6 +13158,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_4 ; RV32ZVE32F-NEXT: .LBB95_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i 
v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12017,6 +13167,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_5 ; RV32ZVE32F-NEXT: .LBB95_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12024,6 +13176,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_6 ; RV32ZVE32F-NEXT: .LBB95_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12031,6 +13185,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB95_7 ; RV32ZVE32F-NEXT: .LBB95_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12053,12 +13209,17 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB95_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB95_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v13 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -12068,48 +13229,60 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB95_14 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_15 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB95_15 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_16 ; RV64ZVE32F-NEXT: .LBB95_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB95_16 -; RV64ZVE32F-NEXT: .LBB95_7: # %else11 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_8 +; RV64ZVE32F-NEXT: .LBB95_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB95_8: # %else11 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB95_9 -; RV64ZVE32F-NEXT: .LBB95_8: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; 
RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB95_9: # %else14 +; RV64ZVE32F-NEXT: .LBB95_10: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB95_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB95_11: # %else17 +; RV64ZVE32F-NEXT: .LBB95_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB95_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB95_13: # %else20 +; RV64ZVE32F-NEXT: .LBB95_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -12119,34 +13292,27 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB95_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: .LBB95_15: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa2, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 8 ; RV64ZVE32F-NEXT: beqz a3, .LBB95_6 -; RV64ZVE32F-NEXT: .LBB95_15: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB95_16: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB95_7 -; RV64ZVE32F-NEXT: .LBB95_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 -; RV64ZVE32F-NEXT: slli a3, a3, 32 -; RV64ZVE32F-NEXT: srli a3, a3, 29 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB95_8 -; RV64ZVE32F-NEXT: j .LBB95_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_7 +; RV64ZVE32F-NEXT: j .LBB95_8 %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -12231,9 +13397,11 @@ ; RV32ZVE32F-NEXT: andi a1, a1, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB96_9 ; RV32ZVE32F-NEXT: .LBB96_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB96_9: # %else20 ; RV32ZVE32F-NEXT: fsd fa0, 0(a0) @@ -12256,6 +13424,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_2 ; RV32ZVE32F-NEXT: .LBB96_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12263,6 +13433,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 4 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_3 ; RV32ZVE32F-NEXT: .LBB96_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12270,6 +13442,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 8 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_4 ; RV32ZVE32F-NEXT: .LBB96_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12277,6 +13451,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 16 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_5 ; RV32ZVE32F-NEXT: .LBB96_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12284,6 +13460,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 32 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_6 ; RV32ZVE32F-NEXT: .LBB96_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12291,6 +13469,8 @@ ; RV32ZVE32F-NEXT: andi a2, a1, 64 ; RV32ZVE32F-NEXT: beqz a2, .LBB96_7 ; RV32ZVE32F-NEXT: .LBB96_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12431,211 +13611,270 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB97_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 1 ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 +; 
RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 ; RV64ZVE32F-NEXT: .LBB97_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_28 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB97_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_29 -; RV64ZVE32F-NEXT: .LBB97_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_10 -; RV64ZVE32F-NEXT: .LBB97_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB97_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 -; RV64ZVE32F-NEXT: .LBB97_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v13, 5 +; 
RV64ZVE32F-NEXT: .LBB97_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_30 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_31 -; RV64ZVE32F-NEXT: .LBB97_12: # %else20 -; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_32 -; RV64ZVE32F-NEXT: .LBB97_13: # %else23 +; RV64ZVE32F-NEXT: .LBB97_14: # %else20 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_16 +; RV64ZVE32F-NEXT: .LBB97_15: # %cond.load22 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 8 +; RV64ZVE32F-NEXT: .LBB97_16: # %else23 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_15 -; RV64ZVE32F-NEXT: .LBB97_14: # %cond.load25 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load25 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9 -; RV64ZVE32F-NEXT: .LBB97_15: # %else26 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 9 +; RV64ZVE32F-NEXT: .LBB97_18: # %else26 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 -; RV64ZVE32F-NEXT: .LBB97_17: # %else29 +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 10 +; RV64ZVE32F-NEXT: .LBB97_20: # %else29 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1 ; RV64ZVE32F-NEXT: addiw a3, a2, 
-2048 ; RV64ZVE32F-NEXT: and a3, a1, a3 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB97_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.load31 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lb a3, 0(a3) +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 11 -; RV64ZVE32F-NEXT: .LBB97_19: # %else32 +; RV64ZVE32F-NEXT: .LBB97_22: # %else32 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.load34 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 12 -; RV64ZVE32F-NEXT: .LBB97_21: # %else35 +; RV64ZVE32F-NEXT: .LBB97_24: # %else35 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 2 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.load37 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 13 -; RV64ZVE32F-NEXT: .LBB97_23: # %else38 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 13 +; RV64ZVE32F-NEXT: .LBB97_26: # %else38 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 4 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.load40 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.load40 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; 
RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 14 -; RV64ZVE32F-NEXT: .LBB97_25: # %else41 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 14 +; RV64ZVE32F-NEXT: .LBB97_28: # %else41 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB97_27 -; RV64ZVE32F-NEXT: # %bb.26: # %cond.load43 +; RV64ZVE32F-NEXT: beqz a1, .LBB97_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.load43 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lb a0, 0(a0) -; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 15 -; RV64ZVE32F-NEXT: .LBB97_27: # %else44 +; RV64ZVE32F-NEXT: .LBB97_30: # %else44 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 -; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_9 -; RV64ZVE32F-NEXT: j .LBB97_10 -; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB97_31: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 6 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_12 -; RV64ZVE32F-NEXT: .LBB97_31: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_14 +; RV64ZVE32F-NEXT: .LBB97_32: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, mu -; 
RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_13 -; RV64ZVE32F-NEXT: .LBB97_32: # %cond.load22 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 8 -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_14 -; RV64ZVE32F-NEXT: j .LBB97_15 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_15 +; RV64ZVE32F-NEXT: j .LBB97_16 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs %v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru) ret <16 x i8> %v @@ -12656,24 +13895,37 @@ ; ; RV64V-LABEL: mgather_baseidx_v32i8: ; RV64V: # %bb.0: +; RV64V-NEXT: vmv2r.v v12, v8 ; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsext.vf8 v16, v12 ; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-NEXT: vmv1r.v v12, v10 -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vmv1r.v v14, v10 +; RV64V-NEXT: vluxei64.v v14, (a0), v16, v0.t +; RV64V-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64V-NEXT: vmv.v.i v8, 0 +; RV64V-NEXT: vmv.v.i v24, 0 ; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, mu -; RV64V-NEXT: vslidedown.vi v10, v10, 16 -; RV64V-NEXT: vslidedown.vi v8, v8, 16 +; RV64V-NEXT: vslidedown.vi v24, v10, 16 +; RV64V-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64V-NEXT: vmv.v.i v10, 0 +; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64V-NEXT: vslidedown.vi v10, v12, 16 ; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsext.vf8 v16, v10 +; RV64V-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64V-NEXT: vmv.v.i v8, 0 ; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64V-NEXT: vslidedown.vi v0, v0, 2 +; RV64V-NEXT: vslidedown.vi v8, v0, 2 ; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t +; RV64V-NEXT: vmv1r.v v0, v8 +; RV64V-NEXT: vluxei64.v v24, (a0), v16, v0.t +; RV64V-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64V-NEXT: vmv.v.i v10, 0 +; RV64V-NEXT: vmv1r.v v8, v14 +; RV64V-NEXT: vmv1r.v v10, v24 ; RV64V-NEXT: li a0, 32 ; RV64V-NEXT: vsetvli zero, a0, e8, m2, tu, mu -; RV64V-NEXT: vslideup.vi v12, v10, 16 -; RV64V-NEXT: vmv2r.v v8, v12 +; RV64V-NEXT: vslideup.vi v8, v10, 16 ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_baseidx_v32i8: @@ -12691,481 +13943,587 @@ ; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB98_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: 
vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 1 ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 ; RV64ZVE32F-NEXT: .LBB98_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_60 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3 +; RV64ZVE32F-NEXT: .LBB98_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_61 -; RV64ZVE32F-NEXT: .LBB98_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_10 -; RV64ZVE32F-NEXT: .LBB98_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4 +; RV64ZVE32F-NEXT: .LBB98_10: # %else11 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v15, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; 
RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v16, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 -; RV64ZVE32F-NEXT: .LBB98_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 5 +; RV64ZVE32F-NEXT: .LBB98_12: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_62 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 -; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_63 -; RV64ZVE32F-NEXT: .LBB98_12: # %else20 -; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: # %bb.13: # %else17 +; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_64 -; RV64ZVE32F-NEXT: .LBB98_13: # %else23 +; RV64ZVE32F-NEXT: .LBB98_14: # %else20 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_16 +; RV64ZVE32F-NEXT: .LBB98_15: # %cond.load22 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 8 +; RV64ZVE32F-NEXT: .LBB98_16: # %else23 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_15 -; RV64ZVE32F-NEXT: .LBB98_14: # %cond.load25 +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load25 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v14, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 9 -; RV64ZVE32F-NEXT: .LBB98_15: # %else26 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 9 +; RV64ZVE32F-NEXT: .LBB98_18: # %else26 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v16, v14, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v16 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; 
RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10 -; RV64ZVE32F-NEXT: .LBB98_17: # %else29 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 10 +; RV64ZVE32F-NEXT: .LBB98_20: # %else29 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1 ; RV64ZVE32F-NEXT: addiw a3, a2, -2048 ; RV64ZVE32F-NEXT: and a3, a1, a3 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB98_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v14, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.load31 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v16, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v14 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lb a3, 0(a3) ; RV64ZVE32F-NEXT: li a4, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a4, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a3 +; RV64ZVE32F-NEXT: vsetvli a5, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a4, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v16, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 11 -; RV64ZVE32F-NEXT: .LBB98_19: # %else32 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 11 +; RV64ZVE32F-NEXT: .LBB98_22: # %else32 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.load34 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 12 -; RV64ZVE32F-NEXT: .LBB98_21: # %else35 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 12 +; RV64ZVE32F-NEXT: .LBB98_24: # %else35 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 2 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.load37 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v15, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, 
e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v16, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 13 -; RV64ZVE32F-NEXT: .LBB98_23: # %else38 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 13 +; RV64ZVE32F-NEXT: .LBB98_26: # %else38 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 4 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.load40 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v15, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.load40 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 14 -; RV64ZVE32F-NEXT: .LBB98_25: # %else41 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 14 +; RV64ZVE32F-NEXT: .LBB98_28: # %else41 ; RV64ZVE32F-NEXT: lui a2, 8 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_27 -; RV64ZVE32F-NEXT: # %bb.26: # %cond.load43 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.load43 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 15 -; RV64ZVE32F-NEXT: .LBB98_27: # %else44 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 15 +; RV64ZVE32F-NEXT: .LBB98_30: # %else44 ; RV64ZVE32F-NEXT: lui a2, 16 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_29 -; RV64ZVE32F-NEXT: # %bb.28: # %cond.load46 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_32 +; RV64ZVE32F-NEXT: # %bb.31: # %cond.load46 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 17, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 16 -; RV64ZVE32F-NEXT: .LBB98_29: # %else47 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 16 +; RV64ZVE32F-NEXT: .LBB98_32: # %else47 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 32 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: 
beqz a2, .LBB98_31 -; RV64ZVE32F-NEXT: # %bb.30: # %cond.load49 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_34 +; RV64ZVE32F-NEXT: # %bb.33: # %cond.load49 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 18, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 17 -; RV64ZVE32F-NEXT: .LBB98_31: # %else50 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 17 +; RV64ZVE32F-NEXT: .LBB98_34: # %else50 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 64 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_33 -; RV64ZVE32F-NEXT: # %bb.32: # %cond.load52 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_36 +; RV64ZVE32F-NEXT: # %bb.35: # %cond.load52 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18 -; RV64ZVE32F-NEXT: .LBB98_33: # %else53 +; RV64ZVE32F-NEXT: .LBB98_36: # %else53 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 128 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_35 -; RV64ZVE32F-NEXT: # %bb.34: # %cond.load55 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_38 +; RV64ZVE32F-NEXT: # %bb.37: # %cond.load55 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 19 -; RV64ZVE32F-NEXT: .LBB98_35: # %else56 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 19 +; RV64ZVE32F-NEXT: .LBB98_38: # %else56 ; RV64ZVE32F-NEXT: lui a2, 256 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_37 -; RV64ZVE32F-NEXT: # %bb.36: # %cond.load58 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_40 +; RV64ZVE32F-NEXT: # %bb.39: # %cond.load58 ; RV64ZVE32F-NEXT: vsetivli zero, 0, 
e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20 -; RV64ZVE32F-NEXT: .LBB98_37: # %else59 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 20 +; RV64ZVE32F-NEXT: .LBB98_40: # %else59 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: lui a2, 512 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_39 -; RV64ZVE32F-NEXT: # %bb.38: # %cond.load61 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_42 +; RV64ZVE32F-NEXT: # %bb.41: # %cond.load61 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 21 -; RV64ZVE32F-NEXT: .LBB98_39: # %else62 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 21 +; RV64ZVE32F-NEXT: .LBB98_42: # %else62 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1024 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_41 -; RV64ZVE32F-NEXT: # %bb.40: # %cond.load64 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_44 +; RV64ZVE32F-NEXT: # %bb.43: # %cond.load64 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 23, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22 -; RV64ZVE32F-NEXT: .LBB98_41: # %else65 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 22 +; RV64ZVE32F-NEXT: .LBB98_44: # %else65 ; RV64ZVE32F-NEXT: lui a2, 2048 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_43 -; RV64ZVE32F-NEXT: # %bb.42: # %cond.load67 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.load67 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; 
RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 24, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23 -; RV64ZVE32F-NEXT: .LBB98_43: # %else68 +; RV64ZVE32F-NEXT: .LBB98_46: # %else68 ; RV64ZVE32F-NEXT: lui a2, 4096 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_45 -; RV64ZVE32F-NEXT: # %bb.44: # %cond.load70 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.load70 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 25, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 24 -; RV64ZVE32F-NEXT: .LBB98_45: # %else71 +; RV64ZVE32F-NEXT: .LBB98_48: # %else71 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 8192 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_47 -; RV64ZVE32F-NEXT: # %bb.46: # %cond.load73 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_50 +; RV64ZVE32F-NEXT: # %bb.49: # %cond.load73 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 26, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 25 -; RV64ZVE32F-NEXT: .LBB98_47: # %else74 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 25 +; RV64ZVE32F-NEXT: .LBB98_50: # %else74 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 16384 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_49 -; RV64ZVE32F-NEXT: # %bb.48: # %cond.load76 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_52 +; RV64ZVE32F-NEXT: # %bb.51: # %cond.load76 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 -; RV64ZVE32F-NEXT: .LBB98_49: # %else77 +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 26 +; RV64ZVE32F-NEXT: .LBB98_52: # %else77 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; 
RV64ZVE32F-NEXT: lui a2, 32768 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_51 -; RV64ZVE32F-NEXT: # %bb.50: # %cond.load79 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_54 +; RV64ZVE32F-NEXT: # %bb.53: # %cond.load79 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27 -; RV64ZVE32F-NEXT: .LBB98_51: # %else80 +; RV64ZVE32F-NEXT: .LBB98_54: # %else80 ; RV64ZVE32F-NEXT: lui a2, 65536 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_53 -; RV64ZVE32F-NEXT: # %bb.52: # %cond.load82 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_56 +; RV64ZVE32F-NEXT: # %bb.55: # %cond.load82 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28 -; RV64ZVE32F-NEXT: .LBB98_53: # %else83 +; RV64ZVE32F-NEXT: .LBB98_56: # %else83 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 131072 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_55 -; RV64ZVE32F-NEXT: # %bb.54: # %cond.load85 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_58 +; RV64ZVE32F-NEXT: # %bb.57: # %cond.load85 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29 -; RV64ZVE32F-NEXT: .LBB98_55: # %else86 +; RV64ZVE32F-NEXT: .LBB98_58: # %else86 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 262144 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_57 -; RV64ZVE32F-NEXT: # %bb.56: # %cond.load88 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_60 +; RV64ZVE32F-NEXT: # %bb.59: # %cond.load88 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; 
RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 31, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 30 -; RV64ZVE32F-NEXT: .LBB98_57: # %else89 +; RV64ZVE32F-NEXT: .LBB98_60: # %else89 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB98_59 -; RV64ZVE32F-NEXT: # %bb.58: # %cond.load91 +; RV64ZVE32F-NEXT: beqz a1, .LBB98_62 +; RV64ZVE32F-NEXT: # %bb.61: # %cond.load91 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: lb a0, 0(a0) ; RV64ZVE32F-NEXT: li a1, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 31 -; RV64ZVE32F-NEXT: .LBB98_59: # %else92 +; RV64ZVE32F-NEXT: .LBB98_62: # %else92 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 -; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_9 -; RV64ZVE32F-NEXT: j .LBB98_10 -; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load16 +; RV64ZVE32F-NEXT: .LBB98_63: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v16, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6 +; RV64ZVE32F-NEXT: vslideup.vi v10, v16, 6 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_12 -; RV64ZVE32F-NEXT: .LBB98_63: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_14 +; RV64ZVE32F-NEXT: .LBB98_64: # %cond.load19 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 
1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 7 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 7 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_13 -; RV64ZVE32F-NEXT: .LBB98_64: # %cond.load22 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lb a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 8 -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_14 -; RV64ZVE32F-NEXT: j .LBB98_15 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_15 +; RV64ZVE32F-NEXT: j .LBB98_16 %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru) ret <32 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -84,9 +84,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB1_2 ; RV64ZVE32F-NEXT: .LBB1_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void @@ -132,9 +134,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB2_2 ; RV64ZVE32F-NEXT: .LBB2_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i16> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) @@ -189,9 +193,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB3_2 ; RV64ZVE32F-NEXT: .LBB3_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 
1, <2 x i1> %m) @@ -268,9 +274,11 @@ ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB4_2 ; RV64ZVE32F-NEXT: .LBB4_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v8, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a3) ; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i8> @@ -320,21 +328,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_2 ; RV64ZVE32F-NEXT: .LBB5_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_3 ; RV64ZVE32F-NEXT: .LBB5_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_4 ; RV64ZVE32F-NEXT: .LBB5_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %m) ret void @@ -380,21 +394,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB6_2 ; RV64ZVE32F-NEXT: .LBB6_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB6_3 ; RV64ZVE32F-NEXT: .LBB6_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB6_4 ; RV64ZVE32F-NEXT: .LBB6_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -468,45 +488,59 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_2 ; RV64ZVE32F-NEXT: .LBB8_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v9, (t0) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_3 ; RV64ZVE32F-NEXT: .LBB8_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, 
mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v9, (a7) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_4 ; RV64ZVE32F-NEXT: .LBB8_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v9, (a6) ; RV64ZVE32F-NEXT: andi a0, a3, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_5 ; RV64ZVE32F-NEXT: .LBB8_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vse8.v v9, (a5) ; RV64ZVE32F-NEXT: andi a0, a3, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_6 ; RV64ZVE32F-NEXT: .LBB8_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_7 ; RV64ZVE32F-NEXT: .LBB8_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB8_8 ; RV64ZVE32F-NEXT: .LBB8_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m) ret void @@ -541,92 +575,113 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vse8.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB9_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB9_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB9_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: 
vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB9_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB9_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_14 -; RV64ZVE32F-NEXT: .LBB9_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB9_10 -; RV64ZVE32F-NEXT: .LBB9_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB9_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB9_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB9_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB9_16 -; RV64ZVE32F-NEXT: .LBB9_12: # %else14 +; RV64ZVE32F-NEXT: .LBB9_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB9_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_8 -; RV64ZVE32F-NEXT: .LBB9_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: 
add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_9 -; RV64ZVE32F-NEXT: j .LBB9_10 ; RV64ZVE32F-NEXT: .LBB9_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB9_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB9_14 ; RV64ZVE32F-NEXT: .LBB9_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v9, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m) @@ -709,9 +764,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB11_2 ; RV64ZVE32F-NEXT: .LBB11_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %val, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -758,9 +815,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB12_2 ; RV64ZVE32F-NEXT: .LBB12_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i16> call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) @@ -834,9 +893,11 @@ ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB13_2 ; RV64ZVE32F-NEXT: .LBB13_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v8, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) ; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i16> @@ -886,21 +947,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_2 ; RV64ZVE32F-NEXT: .LBB14_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, 
ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_3 ; RV64ZVE32F-NEXT: .LBB14_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_4 ; RV64ZVE32F-NEXT: .LBB14_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %m) ret void @@ -946,21 +1013,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB15_2 ; RV64ZVE32F-NEXT: .LBB15_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB15_3 ; RV64ZVE32F-NEXT: .LBB15_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB15_4 ; RV64ZVE32F-NEXT: .LBB15_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -1034,45 +1107,59 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_2 ; RV64ZVE32F-NEXT: .LBB17_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (t0) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_3 ; RV64ZVE32F-NEXT: .LBB17_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a7) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_4 ; RV64ZVE32F-NEXT: .LBB17_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v9, (a6) ; RV64ZVE32F-NEXT: andi a0, a3, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_5 ; RV64ZVE32F-NEXT: .LBB17_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vse16.v v9, (a5) ; RV64ZVE32F-NEXT: andi a0, a3, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_6 ; RV64ZVE32F-NEXT: .LBB17_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_7 ; RV64ZVE32F-NEXT: .LBB17_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB17_8 ; RV64ZVE32F-NEXT: .LBB17_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -1110,99 +1197,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB18_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB18_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB18_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli 
a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB18_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_14 -; RV64ZVE32F-NEXT: .LBB18_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB18_10 -; RV64ZVE32F-NEXT: .LBB18_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB18_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB18_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB18_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB18_16 -; RV64ZVE32F-NEXT: .LBB18_12: # %else14 +; RV64ZVE32F-NEXT: .LBB18_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB18_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_8 -; RV64ZVE32F-NEXT: .LBB18_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_9 -; RV64ZVE32F-NEXT: j .LBB18_10 ; RV64ZVE32F-NEXT: .LBB18_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB18_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB18_14 ; RV64ZVE32F-NEXT: .LBB18_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) @@ -1241,99 +1349,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB19_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB19_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB19_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB19_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_14 -; RV64ZVE32F-NEXT: .LBB19_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_10 -; RV64ZVE32F-NEXT: .LBB19_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB19_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB19_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB19_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB19_16 -; RV64ZVE32F-NEXT: .LBB19_12: # %else14 +; RV64ZVE32F-NEXT: .LBB19_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB19_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_8 -; RV64ZVE32F-NEXT: .LBB19_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_9 -; RV64ZVE32F-NEXT: j .LBB19_10 ; RV64ZVE32F-NEXT: .LBB19_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; 
RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB19_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB19_14 ; RV64ZVE32F-NEXT: .LBB19_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs @@ -1374,106 +1503,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB20_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB20_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB20_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB20_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_14 -; RV64ZVE32F-NEXT: .LBB20_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_10 -; RV64ZVE32F-NEXT: .LBB20_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; 
RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB20_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 -; RV64ZVE32F-NEXT: .LBB20_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB20_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB20_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_8 -; RV64ZVE32F-NEXT: .LBB20_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB20_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_9 -; RV64ZVE32F-NEXT: j .LBB20_10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB20_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 +; RV64ZVE32F-NEXT: .LBB20_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB20_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB20_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB20_14 ; RV64ZVE32F-NEXT: .LBB20_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs @@ -1514,99 +1664,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB21_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB21_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB21_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB21_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB21_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_14 -; RV64ZVE32F-NEXT: .LBB21_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB21_10 -; RV64ZVE32F-NEXT: .LBB21_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB21_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB21_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB21_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB21_16 -; RV64ZVE32F-NEXT: .LBB21_12: # %else14 +; RV64ZVE32F-NEXT: .LBB21_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB21_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_8 -; RV64ZVE32F-NEXT: .LBB21_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_9 -; RV64ZVE32F-NEXT: j .LBB21_10 ; RV64ZVE32F-NEXT: .LBB21_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: 
beqz a1, .LBB21_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB21_14 ; RV64ZVE32F-NEXT: .LBB21_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) @@ -1689,9 +1860,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB23_2 ; RV64ZVE32F-NEXT: .LBB23_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) ret void @@ -1752,9 +1925,11 @@ ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB24_2 ; RV64ZVE32F-NEXT: .LBB24_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) ; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i32> @@ -1804,21 +1979,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_2 ; RV64ZVE32F-NEXT: .LBB25_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_3 ; RV64ZVE32F-NEXT: .LBB25_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_4 ; RV64ZVE32F-NEXT: .LBB25_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %m) ret void @@ -1864,21 +2045,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB26_2 ; RV64ZVE32F-NEXT: .LBB26_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB26_3 ; RV64ZVE32F-NEXT: .LBB26_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB26_4 ; RV64ZVE32F-NEXT: .LBB26_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -1952,45 +2139,59 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_2 ; RV64ZVE32F-NEXT: .LBB28_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v10, (t0) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_3 ; RV64ZVE32F-NEXT: .LBB28_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v10, (a7) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_4 ; RV64ZVE32F-NEXT: .LBB28_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v10, (a6) ; RV64ZVE32F-NEXT: andi a0, a3, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_5 ; RV64ZVE32F-NEXT: .LBB28_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v10, (a5) ; RV64ZVE32F-NEXT: andi a0, a3, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_6 ; RV64ZVE32F-NEXT: .LBB28_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v10, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_7 ; RV64ZVE32F-NEXT: .LBB28_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB28_8 ; RV64ZVE32F-NEXT: .LBB28_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a1) ; RV64ZVE32F-NEXT: ret 
call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -2027,99 +2228,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB29_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB29_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB29_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_14 -; RV64ZVE32F-NEXT: .LBB29_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_10 -; RV64ZVE32F-NEXT: .LBB29_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB29_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 -; RV64ZVE32F-NEXT: .LBB29_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB29_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; 
RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_8 -; RV64ZVE32F-NEXT: .LBB29_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_9 -; RV64ZVE32F-NEXT: j .LBB29_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 +; RV64ZVE32F-NEXT: .LBB29_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB29_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB29_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB29_14 ; RV64ZVE32F-NEXT: .LBB29_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -2157,99 +2379,120 @@ ; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB30_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB30_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB30_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB30_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_14 -; RV64ZVE32F-NEXT: .LBB30_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_10 -; RV64ZVE32F-NEXT: .LBB30_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB30_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 -; RV64ZVE32F-NEXT: .LBB30_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB30_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: 
vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_8 -; RV64ZVE32F-NEXT: .LBB30_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_9 -; RV64ZVE32F-NEXT: j .LBB30_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 +; RV64ZVE32F-NEXT: .LBB30_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB30_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB30_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB30_14 ; RV64ZVE32F-NEXT: .LBB30_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs @@ -2289,106 +2532,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB31_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; 
RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB31_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB31_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_14 -; RV64ZVE32F-NEXT: .LBB31_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_10 -; RV64ZVE32F-NEXT: .LBB31_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB31_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 -; RV64ZVE32F-NEXT: .LBB31_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB31_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_8 -; RV64ZVE32F-NEXT: .LBB31_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_9 -; RV64ZVE32F-NEXT: j .LBB31_10 -; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB31_12 -; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 +; RV64ZVE32F-NEXT: .LBB31_14: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB31_14 +; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; 
RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs @@ -2428,99 +2692,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB32_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB32_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB32_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB32_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB32_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_14 -; RV64ZVE32F-NEXT: .LBB32_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_10 -; RV64ZVE32F-NEXT: .LBB32_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB32_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 -; RV64ZVE32F-NEXT: .LBB32_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB32_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: 
vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_8 -; RV64ZVE32F-NEXT: .LBB32_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_9 -; RV64ZVE32F-NEXT: j .LBB32_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 +; RV64ZVE32F-NEXT: .LBB32_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB32_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB32_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB32_14 ; RV64ZVE32F-NEXT: .LBB32_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ 
-2559,99 +2844,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB33_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB33_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB33_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB33_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_14 -; RV64ZVE32F-NEXT: .LBB33_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_10 -; RV64ZVE32F-NEXT: .LBB33_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB33_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 -; RV64ZVE32F-NEXT: .LBB33_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB33_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, 
ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_8 -; RV64ZVE32F-NEXT: .LBB33_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_9 -; RV64ZVE32F-NEXT: j .LBB33_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 +; RV64ZVE32F-NEXT: .LBB33_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB33_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB33_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB33_14 ; RV64ZVE32F-NEXT: .LBB33_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs @@ -2694,106 +3000,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a3) ; RV64ZVE32F-NEXT: .LBB34_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, 
e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB34_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a3) ; RV64ZVE32F-NEXT: .LBB34_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB34_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a3) ; RV64ZVE32F-NEXT: .LBB34_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_14 -; RV64ZVE32F-NEXT: .LBB34_8: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_10 -; RV64ZVE32F-NEXT: .LBB34_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB34_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_16 -; RV64ZVE32F-NEXT: .LBB34_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: .LBB34_8: # %else6 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, 
.LBB34_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_8 -; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: .LBB34_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_9 -; RV64ZVE32F-NEXT: j .LBB34_10 +; RV64ZVE32F-NEXT: .LBB34_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_16 +; RV64ZVE32F-NEXT: .LBB34_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_14 ; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs @@ -2832,101 +3159,125 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; 
RV64ZVE32F-NEXT: .LBB35_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v16, (a2) ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 ; RV64ZVE32F-NEXT: .LBB35_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 -; RV64ZVE32F-NEXT: .LBB35_7: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 +; RV64ZVE32F-NEXT: .LBB35_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB35_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_9 -; RV64ZVE32F-NEXT: .LBB35_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB35_9: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB35_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 -; RV64ZVE32F-NEXT: .LBB35_11: # %else14 +; RV64ZVE32F-NEXT: .LBB35_12: # 
%else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB35_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB35_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 -; RV64ZVE32F-NEXT: .LBB35_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB35_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v14, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_7 -; RV64ZVE32F-NEXT: .LBB35_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 -; RV64ZVE32F-NEXT: j .LBB35_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_7 +; RV64ZVE32F-NEXT: j .LBB35_8 ; RV64ZVE32F-NEXT: .LBB35_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB35_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_12 ; RV64ZVE32F-NEXT: .LBB35_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -3018,9 +3369,11 @@ ; RV32ZVE32F-NEXT: andi a0, a3, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB37_2 ; RV32ZVE32F-NEXT: .LBB37_4: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, 
ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: sw a1, 0(a0) ; RV32ZVE32F-NEXT: ret @@ -3095,6 +3448,8 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB38_2 ; RV32ZVE32F-NEXT: .LBB38_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3103,6 +3458,8 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB38_3 ; RV32ZVE32F-NEXT: .LBB38_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3111,9 +3468,11 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB38_4 ; RV32ZVE32F-NEXT: .LBB38_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) ; RV32ZVE32F-NEXT: ret @@ -3208,6 +3567,8 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB39_2 ; RV32ZVE32F-NEXT: .LBB39_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3216,6 +3577,8 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB39_3 ; RV32ZVE32F-NEXT: .LBB39_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3224,9 +3587,11 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB39_4 ; RV32ZVE32F-NEXT: .LBB39_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) ; RV32ZVE32F-NEXT: ret @@ -3314,7 +3679,7 @@ ; RV32ZVE32F-NEXT: lw a2, 56(a0) ; RV32ZVE32F-NEXT: lw a3, 52(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) -; RV32ZVE32F-NEXT: lw a5, 44(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) ; RV32ZVE32F-NEXT: lw a7, 40(a0) ; RV32ZVE32F-NEXT: lw t0, 36(a0) ; RV32ZVE32F-NEXT: lw t1, 32(a0) @@ -3325,34 +3690,36 @@ ; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi s1, a6, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v0 +; RV32ZVE32F-NEXT: andi s1, a5, 1 ; RV32ZVE32F-NEXT: bnez s1, .LBB41_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_11 ; RV32ZVE32F-NEXT: .LBB41_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; 
RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_12 ; RV32ZVE32F-NEXT: .LBB41_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_13 ; RV32ZVE32F-NEXT: .LBB41_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_14 ; RV32ZVE32F-NEXT: .LBB41_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_15 ; RV32ZVE32F-NEXT: .LBB41_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_16 ; RV32ZVE32F-NEXT: .LBB41_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_9 ; RV32ZVE32F-NEXT: .LBB41_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) ; RV32ZVE32F-NEXT: .LBB41_9: # %else14 @@ -3368,55 +3735,67 @@ ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_2 ; RV32ZVE32F-NEXT: .LBB41_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw s0, 4(a0) ; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_3 ; RV32ZVE32F-NEXT: .LBB41_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t5, 0(a0) ; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_4 ; RV32ZVE32F-NEXT: .LBB41_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t3, 0(a0) ; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_5 ; RV32ZVE32F-NEXT: .LBB41_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw t1, 0(a0) ; RV32ZVE32F-NEXT: sw t0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_6 ; RV32ZVE32F-NEXT: .LBB41_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a7, 0(a0) -; 
RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_7 ; RV32ZVE32F-NEXT: .LBB41_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_8 ; RV32ZVE32F-NEXT: j .LBB41_9 ; @@ -3584,9 +3963,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_9 ; RV32ZVE32F-NEXT: .LBB42_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB42_9: # %else14 @@ -3605,6 +3986,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_2 ; RV32ZVE32F-NEXT: .LBB42_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3613,6 +3996,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_3 ; RV32ZVE32F-NEXT: .LBB42_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3621,6 +4006,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_4 ; RV32ZVE32F-NEXT: .LBB42_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3629,6 +4016,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_5 ; RV32ZVE32F-NEXT: .LBB42_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3637,6 +4026,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_6 ; RV32ZVE32F-NEXT: .LBB42_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3645,6 +4036,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_7 ; RV32ZVE32F-NEXT: .LBB42_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3674,9 +4067,12 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB42_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: 
andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB42_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3684,72 +4080,76 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB42_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: .LBB42_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_14 -; RV64ZVE32F-NEXT: .LBB42_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_10 -; RV64ZVE32F-NEXT: .LBB42_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB42_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 -; RV64ZVE32F-NEXT: .LBB42_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB42_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_8 -; RV64ZVE32F-NEXT: .LBB42_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_9 -; RV64ZVE32F-NEXT: j .LBB42_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; 
RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 +; RV64ZVE32F-NEXT: .LBB42_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB42_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_14 ; RV64ZVE32F-NEXT: .LBB42_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -3833,9 +4233,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_9 ; RV32ZVE32F-NEXT: .LBB43_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB43_9: # %else14 @@ -3854,6 +4256,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_2 ; RV32ZVE32F-NEXT: .LBB43_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3862,6 +4266,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_3 ; RV32ZVE32F-NEXT: .LBB43_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3870,6 +4276,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_4 ; RV32ZVE32F-NEXT: .LBB43_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3878,6 +4286,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_5 ; RV32ZVE32F-NEXT: .LBB43_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3886,6 +4296,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_6 ; RV32ZVE32F-NEXT: .LBB43_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; 
RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3894,6 +4306,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_7 ; RV32ZVE32F-NEXT: .LBB43_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -3923,9 +4337,12 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB43_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB43_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -3933,72 +4350,76 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_14 -; RV64ZVE32F-NEXT: .LBB43_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB43_10 -; RV64ZVE32F-NEXT: .LBB43_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB43_10: # %else10 +; RV64ZVE32F-NEXT: .LBB43_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi 
a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_16 -; RV64ZVE32F-NEXT: .LBB43_12: # %else14 +; RV64ZVE32F-NEXT: .LBB43_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB43_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_8 -; RV64ZVE32F-NEXT: .LBB43_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_9 -; RV64ZVE32F-NEXT: j .LBB43_10 ; RV64ZVE32F-NEXT: .LBB43_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_14 ; RV64ZVE32F-NEXT: .LBB43_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -4083,9 +4504,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_9 ; RV32ZVE32F-NEXT: .LBB44_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB44_9: # %else14 @@ -4104,6 +4527,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_2 ; RV32ZVE32F-NEXT: .LBB44_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4112,6 +4537,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_3 ; RV32ZVE32F-NEXT: .LBB44_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4120,6 +4547,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_4 ; RV32ZVE32F-NEXT: .LBB44_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4128,6 +4557,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_5 
; RV32ZVE32F-NEXT: .LBB44_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4136,6 +4567,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_6 ; RV32ZVE32F-NEXT: .LBB44_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4144,6 +4577,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_7 ; RV32ZVE32F-NEXT: .LBB44_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4174,9 +4609,12 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB44_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4185,12 +4623,14 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB44_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -4198,52 +4638,52 @@ ; RV64ZVE32F-NEXT: .LBB44_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_14 -; RV64ZVE32F-NEXT: .LBB44_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_10 -; RV64ZVE32F-NEXT: .LBB44_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_10: 
# %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB44_10: # %else10 +; RV64ZVE32F-NEXT: .LBB44_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_16 -; RV64ZVE32F-NEXT: .LBB44_12: # %else14 +; RV64ZVE32F-NEXT: .LBB44_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: andi a0, a0, 255 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_8 -; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: andi a0, a0, 255 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_9 -; RV64ZVE32F-NEXT: j .LBB44_10 ; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 @@ -4251,11 +4691,13 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_14 ; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -4339,9 +4781,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_9 ; RV32ZVE32F-NEXT: .LBB45_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB45_9: # %else14 @@ -4360,6 +4804,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_2 ; RV32ZVE32F-NEXT: .LBB45_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4368,6 +4814,8 @@ 
; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_3 ; RV32ZVE32F-NEXT: .LBB45_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4376,6 +4824,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4384,6 +4834,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_5 ; RV32ZVE32F-NEXT: .LBB45_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4392,6 +4844,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_6 ; RV32ZVE32F-NEXT: .LBB45_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4400,6 +4854,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_7 ; RV32ZVE32F-NEXT: .LBB45_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4430,9 +4886,12 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB45_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB45_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4440,72 +4899,76 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB45_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: .LBB45_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; 
RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_14 -; RV64ZVE32F-NEXT: .LBB45_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB45_10 -; RV64ZVE32F-NEXT: .LBB45_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB45_10: # %else10 +; RV64ZVE32F-NEXT: .LBB45_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_16 -; RV64ZVE32F-NEXT: .LBB45_12: # %else14 +; RV64ZVE32F-NEXT: .LBB45_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB45_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_8 -; RV64ZVE32F-NEXT: .LBB45_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_9 -; RV64ZVE32F-NEXT: j .LBB45_10 ; RV64ZVE32F-NEXT: .LBB45_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_14 ; RV64ZVE32F-NEXT: .LBB45_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -4589,9 +5052,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_9 ; RV32ZVE32F-NEXT: .LBB46_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 
+; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB46_9: # %else14 @@ -4610,6 +5075,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_2 ; RV32ZVE32F-NEXT: .LBB46_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4618,6 +5085,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_3 ; RV32ZVE32F-NEXT: .LBB46_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4626,6 +5095,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_4 ; RV32ZVE32F-NEXT: .LBB46_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4634,6 +5105,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_5 ; RV32ZVE32F-NEXT: .LBB46_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4642,6 +5115,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_6 ; RV32ZVE32F-NEXT: .LBB46_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4650,6 +5125,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_7 ; RV32ZVE32F-NEXT: .LBB46_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4680,9 +5157,12 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB46_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB46_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4690,72 +5170,76 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB46_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB46_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: .LBB46_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, 
e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_14 -; RV64ZVE32F-NEXT: .LBB46_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB46_10 -; RV64ZVE32F-NEXT: .LBB46_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB46_10: # %else10 +; RV64ZVE32F-NEXT: .LBB46_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_16 -; RV64ZVE32F-NEXT: .LBB46_12: # %else14 +; RV64ZVE32F-NEXT: .LBB46_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB46_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_8 -; RV64ZVE32F-NEXT: .LBB46_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_9 -; RV64ZVE32F-NEXT: j .LBB46_10 ; RV64ZVE32F-NEXT: .LBB46_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_14 ; RV64ZVE32F-NEXT: .LBB46_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: 
vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -4840,9 +5324,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_9 ; RV32ZVE32F-NEXT: .LBB47_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB47_9: # %else14 @@ -4861,6 +5347,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_2 ; RV32ZVE32F-NEXT: .LBB47_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4869,6 +5357,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_3 ; RV32ZVE32F-NEXT: .LBB47_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4877,6 +5367,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4885,6 +5377,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4893,6 +5387,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4901,6 +5397,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV32ZVE32F-NEXT: .LBB47_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -4934,9 +5432,12 @@ ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: sd a0, 0(t3) ; RV64ZVE32F-NEXT: .LBB47_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4945,12 +5446,14 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t2, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a0, 
zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a0, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -4958,52 +5461,52 @@ ; RV64ZVE32F-NEXT: .LBB47_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 -; RV64ZVE32F-NEXT: .LBB47_8: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_10 -; RV64ZVE32F-NEXT: .LBB47_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: .LBB47_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 -; RV64ZVE32F-NEXT: .LBB47_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_8 -; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_9 -; RV64ZVE32F-NEXT: j .LBB47_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a5, 64 +; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a5, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 +; RV64ZVE32F-NEXT: .LBB47_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: and a0, a0, a4 @@ -5011,11 +5514,13 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_14 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -5097,9 +5602,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_9 ; RV32ZVE32F-NEXT: .LBB48_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB48_9: # %else14 @@ -5118,6 +5625,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_2 ; RV32ZVE32F-NEXT: .LBB48_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5126,6 +5635,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_3 ; RV32ZVE32F-NEXT: .LBB48_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5134,6 +5645,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5142,6 +5655,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5150,6 +5665,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5158,6 +5675,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_7 ; RV32ZVE32F-NEXT: .LBB48_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; 
RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5188,12 +5707,17 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB48_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB48_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v13 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) @@ -5202,70 +5726,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_13 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_14 ; RV64ZVE32F-NEXT: .LBB48_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_14 -; RV64ZVE32F-NEXT: .LBB48_7: # %else8 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_8 +; RV64ZVE32F-NEXT: .LBB48_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB48_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_9 -; RV64ZVE32F-NEXT: .LBB48_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB48_9: # %else10 +; RV64ZVE32F-NEXT: .LBB48_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_16 -; RV64ZVE32F-NEXT: .LBB48_11: # %else14 +; RV64ZVE32F-NEXT: .LBB48_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB48_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: .LBB48_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v12 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB48_6 -; RV64ZVE32F-NEXT: .LBB48_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB48_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_7 -; RV64ZVE32F-NEXT: .LBB48_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_8 -; RV64ZVE32F-NEXT: j .LBB48_9 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_7 +; RV64ZVE32F-NEXT: j .LBB48_8 ; RV64ZVE32F-NEXT: .LBB48_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_11 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_12 ; RV64ZVE32F-NEXT: .LBB48_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -5348,9 +5877,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_9 ; RV32ZVE32F-NEXT: .LBB49_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB49_9: # %else14 @@ -5369,6 +5900,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_2 ; RV32ZVE32F-NEXT: .LBB49_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5377,6 +5910,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_3 ; RV32ZVE32F-NEXT: .LBB49_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5385,6 +5920,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5393,6 +5930,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5401,6 +5940,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, 
.LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5409,6 +5950,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_7 ; RV32ZVE32F-NEXT: .LBB49_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5439,12 +5982,17 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB49_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB49_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v13 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) @@ -5453,70 +6001,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_13 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_14 ; RV64ZVE32F-NEXT: .LBB49_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_14 -; RV64ZVE32F-NEXT: .LBB49_7: # %else8 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_8 +; RV64ZVE32F-NEXT: .LBB49_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB49_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_9 -; RV64ZVE32F-NEXT: .LBB49_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB49_9: # %else10 +; RV64ZVE32F-NEXT: .LBB49_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_16 -; RV64ZVE32F-NEXT: .LBB49_11: # %else14 +; RV64ZVE32F-NEXT: .LBB49_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB49_12: # %cond.store3 -; RV64ZVE32F-NEXT: 
vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: .LBB49_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v12 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB49_6 -; RV64ZVE32F-NEXT: .LBB49_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB49_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_7 -; RV64ZVE32F-NEXT: .LBB49_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_8 -; RV64ZVE32F-NEXT: j .LBB49_9 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_7 +; RV64ZVE32F-NEXT: j .LBB49_8 ; RV64ZVE32F-NEXT: .LBB49_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_11 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_12 ; RV64ZVE32F-NEXT: .LBB49_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a2, 0(a0) @@ -5600,9 +6153,11 @@ ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_9 ; RV32ZVE32F-NEXT: .LBB50_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a3, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB50_9: # %else14 @@ -5621,6 +6176,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_2 ; RV32ZVE32F-NEXT: .LBB50_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5629,6 +6186,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_3 ; RV32ZVE32F-NEXT: .LBB50_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5637,6 +6196,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; 
RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5645,6 +6206,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5653,6 +6216,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5661,6 +6226,8 @@ ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_7 ; RV32ZVE32F-NEXT: .LBB50_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 @@ -5692,12 +6259,17 @@ ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB50_2: # %else +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a0, .LBB50_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v13 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -5707,63 +6279,66 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_13 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_14 ; RV64ZVE32F-NEXT: .LBB50_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_14 -; RV64ZVE32F-NEXT: .LBB50_7: # %else8 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_8 +; RV64ZVE32F-NEXT: .LBB50_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB50_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_9 -; RV64ZVE32F-NEXT: .LBB50_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: 
.LBB50_9: # %else10 +; RV64ZVE32F-NEXT: .LBB50_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a0, a4, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB50_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB50_16 -; RV64ZVE32F-NEXT: .LBB50_11: # %else14 +; RV64ZVE32F-NEXT: .LBB50_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB50_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: .LBB50_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v12 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB50_6 -; RV64ZVE32F-NEXT: .LBB50_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB50_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_7 -; RV64ZVE32F-NEXT: .LBB50_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 -; RV64ZVE32F-NEXT: slli a0, a0, 32 -; RV64ZVE32F-NEXT: srli a0, a0, 29 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_8 -; RV64ZVE32F-NEXT: j .LBB50_9 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_7 +; RV64ZVE32F-NEXT: j .LBB50_8 ; RV64ZVE32F-NEXT: .LBB50_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 32 @@ -5771,11 +6346,13 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_11 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_12 ; RV64ZVE32F-NEXT: .LBB50_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 @@ -5899,9 +6476,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB51_9 ; RV32ZVE32F-NEXT: .LBB51_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 @@ -5928,6 +6507,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_2 ; RV32ZVE32F-NEXT: .LBB51_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ 
-5936,6 +6517,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_3 ; RV32ZVE32F-NEXT: .LBB51_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -5944,6 +6527,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -5952,6 +6537,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -5960,6 +6547,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -5968,6 +6557,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB51_7 ; RV32ZVE32F-NEXT: .LBB51_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -6165,9 +6756,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB53_2 ; RV64ZVE32F-NEXT: .LBB53_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> %val, <2 x half*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -6215,21 +6808,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_2 ; RV64ZVE32F-NEXT: .LBB54_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_3 ; RV64ZVE32F-NEXT: .LBB54_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_4 ; RV64ZVE32F-NEXT: .LBB54_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void 
@llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %m) ret void @@ -6275,21 +6874,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB55_2 ; RV64ZVE32F-NEXT: .LBB55_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB55_3 ; RV64ZVE32F-NEXT: .LBB55_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB55_4 ; RV64ZVE32F-NEXT: .LBB55_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -6363,45 +6968,59 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (t0) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a7) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v9, (a6) ; RV64ZVE32F-NEXT: andi a0, a3, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vse16.v v9, (a5) ; RV64ZVE32F-NEXT: andi a0, a3, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.store13 +; 
RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -6439,99 +7058,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB58_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB58_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB58_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB58_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB58_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_14 -; RV64ZVE32F-NEXT: .LBB58_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB58_10 -; RV64ZVE32F-NEXT: .LBB58_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; 
RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB58_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB58_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB58_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB58_16 -; RV64ZVE32F-NEXT: .LBB58_12: # %else14 +; RV64ZVE32F-NEXT: .LBB58_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB58_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_8 -; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_9 -; RV64ZVE32F-NEXT: j .LBB58_10 ; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB58_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB58_14 ; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds half, half* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) @@ -6570,99 +7210,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB59_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB59_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB59_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB59_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_14 -; RV64ZVE32F-NEXT: .LBB59_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_10 -; RV64ZVE32F-NEXT: .LBB59_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB59_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB59_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB59_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB59_16 -; RV64ZVE32F-NEXT: .LBB59_12: # %else14 +; RV64ZVE32F-NEXT: .LBB59_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB59_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_8 -; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_9 -; RV64ZVE32F-NEXT: j .LBB59_10 ; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB59_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB59_14 ; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; 
RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs @@ -6703,106 +7364,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB60_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB60_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB60_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_14 -; RV64ZVE32F-NEXT: .LBB60_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_10 -; RV64ZVE32F-NEXT: .LBB60_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB60_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 -; RV64ZVE32F-NEXT: .LBB60_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB60_13: # 
%cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB60_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_8 -; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB60_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_9 -; RV64ZVE32F-NEXT: j .LBB60_10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB60_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 +; RV64ZVE32F-NEXT: .LBB60_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB60_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB60_14 ; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; 
RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs @@ -6843,99 +7525,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB61_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB61_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB61_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB61_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB61_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_14 -; RV64ZVE32F-NEXT: .LBB61_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB61_10 -; RV64ZVE32F-NEXT: .LBB61_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; 
RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB61_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB61_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB61_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB61_16 -; RV64ZVE32F-NEXT: .LBB61_12: # %else14 +; RV64ZVE32F-NEXT: .LBB61_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB61_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_8 -; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_9 -; RV64ZVE32F-NEXT: j .LBB61_10 ; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB61_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB61_14 ; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 ; RV64ZVE32F-NEXT: slli a1, a1, 1 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v9, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr 
inbounds half, half* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) @@ -7018,9 +7721,11 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB63_2 ; RV64ZVE32F-NEXT: .LBB63_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %m) ret void @@ -7068,21 +7773,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_2 ; RV64ZVE32F-NEXT: .LBB64_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_3 ; RV64ZVE32F-NEXT: .LBB64_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_4 ; RV64ZVE32F-NEXT: .LBB64_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %m) ret void @@ -7128,21 +7839,27 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB65_2 ; RV64ZVE32F-NEXT: .LBB65_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB65_3 ; RV64ZVE32F-NEXT: .LBB65_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB65_4 ; RV64ZVE32F-NEXT: .LBB65_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v9, (a1) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -7216,45 +7933,59 @@ ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_2 ; RV64ZVE32F-NEXT: .LBB67_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi 
v10, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v10, (t0) ; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_3 ; RV64ZVE32F-NEXT: .LBB67_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v10, (a7) ; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_4 ; RV64ZVE32F-NEXT: .LBB67_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v10, (a6) ; RV64ZVE32F-NEXT: andi a0, a3, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_5 ; RV64ZVE32F-NEXT: .LBB67_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v10, (a5) ; RV64ZVE32F-NEXT: andi a0, a3, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_6 ; RV64ZVE32F-NEXT: .LBB67_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v10, (a4) ; RV64ZVE32F-NEXT: andi a0, a3, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_7 ; RV64ZVE32F-NEXT: .LBB67_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a0, a3, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB67_8 ; RV64ZVE32F-NEXT: .LBB67_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a1) ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -7291,99 +8022,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB68_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB68_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB68_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB68_6 ; 
RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB68_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB68_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB68_14 -; RV64ZVE32F-NEXT: .LBB68_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_10 -; RV64ZVE32F-NEXT: .LBB68_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB68_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB68_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB68_16 -; RV64ZVE32F-NEXT: .LBB68_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB68_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_8 -; RV64ZVE32F-NEXT: .LBB68_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB68_9 -; RV64ZVE32F-NEXT: j .LBB68_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB68_16 +; RV64ZVE32F-NEXT: .LBB68_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB68_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB68_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB68_14 ; RV64ZVE32F-NEXT: .LBB68_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -7421,99 +8173,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB69_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB69_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB69_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB69_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: 
slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB69_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB69_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB69_14 -; RV64ZVE32F-NEXT: .LBB69_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_10 -; RV64ZVE32F-NEXT: .LBB69_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB69_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB69_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB69_16 -; RV64ZVE32F-NEXT: .LBB69_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB69_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_8 -; RV64ZVE32F-NEXT: .LBB69_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB69_9 -; RV64ZVE32F-NEXT: j .LBB69_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: 
add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB69_16 +; RV64ZVE32F-NEXT: .LBB69_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB69_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB69_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB69_14 ; RV64ZVE32F-NEXT: .LBB69_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs @@ -7553,106 +8326,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB70_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB70_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; 
RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB70_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_14 -; RV64ZVE32F-NEXT: .LBB70_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_10 -; RV64ZVE32F-NEXT: .LBB70_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB70_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB70_16 -; RV64ZVE32F-NEXT: .LBB70_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB70_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: .LBB70_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_8 -; RV64ZVE32F-NEXT: .LBB70_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB70_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_9 -; RV64ZVE32F-NEXT: j .LBB70_10 +; RV64ZVE32F-NEXT: .LBB70_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB70_16 +; RV64ZVE32F-NEXT: .LBB70_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB70_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB70_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB70_14 ; RV64ZVE32F-NEXT: .LBB70_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs @@ -7692,99 +8486,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB71_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB71_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB71_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB71_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu 
+; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB71_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB71_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB71_14 -; RV64ZVE32F-NEXT: .LBB71_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_10 -; RV64ZVE32F-NEXT: .LBB71_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB71_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB71_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB71_16 -; RV64ZVE32F-NEXT: .LBB71_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB71_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_8 -; RV64ZVE32F-NEXT: .LBB71_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB71_9 -; RV64ZVE32F-NEXT: j .LBB71_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i 
v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB71_16 +; RV64ZVE32F-NEXT: .LBB71_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB71_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB71_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB71_14 ; RV64ZVE32F-NEXT: .LBB71_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -7823,99 +8638,120 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB72_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB72_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB72_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB72_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi 
v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB72_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB72_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB72_14 -; RV64ZVE32F-NEXT: .LBB72_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_10 -; RV64ZVE32F-NEXT: .LBB72_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB72_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB72_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB72_16 -; RV64ZVE32F-NEXT: .LBB72_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB72_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_8 -; RV64ZVE32F-NEXT: .LBB72_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB72_9 -; RV64ZVE32F-NEXT: j .LBB72_10 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: 
vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB72_16 +; RV64ZVE32F-NEXT: .LBB72_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB72_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB72_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB72_14 ; RV64ZVE32F-NEXT: .LBB72_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs @@ -7958,106 +8794,127 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a3) ; RV64ZVE32F-NEXT: .LBB73_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB73_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v14, (a3) ; RV64ZVE32F-NEXT: .LBB73_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB73_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: 
vse32.v v14, (a3) ; RV64ZVE32F-NEXT: .LBB73_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB73_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB73_14 -; RV64ZVE32F-NEXT: .LBB73_8: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_10 -; RV64ZVE32F-NEXT: .LBB73_9: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB73_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB73_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: bnez a2, .LBB73_16 -; RV64ZVE32F-NEXT: .LBB73_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB73_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: .LBB73_8: # %else6 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_8 -; RV64ZVE32F-NEXT: .LBB73_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: .LBB73_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB73_9 -; 
RV64ZVE32F-NEXT: j .LBB73_10 +; RV64ZVE32F-NEXT: .LBB73_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_16 +; RV64ZVE32F-NEXT: .LBB73_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB73_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB73_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_14 ; RV64ZVE32F-NEXT: .LBB73_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs @@ -8096,101 +8953,125 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB74_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB74_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v16, (a2) ; RV64ZVE32F-NEXT: .LBB74_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_14 ; RV64ZVE32F-NEXT: .LBB74_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: 
bnez a2, .LBB74_14 -; RV64ZVE32F-NEXT: .LBB74_7: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-NEXT: .LBB74_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB74_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_9 -; RV64ZVE32F-NEXT: .LBB74_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB74_9: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB74_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 -; RV64ZVE32F-NEXT: .LBB74_11: # %else14 +; RV64ZVE32F-NEXT: .LBB74_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB74_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB74_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 -; RV64ZVE32F-NEXT: .LBB74_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB74_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v14, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_7 -; RV64ZVE32F-NEXT: .LBB74_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_8 -; RV64ZVE32F-NEXT: j .LBB74_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_7 +; RV64ZVE32F-NEXT: j .LBB74_8 ; RV64ZVE32F-NEXT: .LBB74_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB74_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB74_12 ; RV64ZVE32F-NEXT: .LBB74_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: slli a1, a1, 2 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v10, (a0) ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -8276,9 +9157,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB76_2 ; RV32ZVE32F-NEXT: .LBB76_4: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: fsd fa1, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -8343,6 +9226,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB77_2 ; RV32ZVE32F-NEXT: .LBB77_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 @@ -8350,6 +9235,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB77_3 ; RV32ZVE32F-NEXT: .LBB77_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 @@ -8357,9 +9244,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB77_4 ; RV32ZVE32F-NEXT: .LBB77_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: fsd fa3, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -8440,6 +9329,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB78_2 ; RV32ZVE32F-NEXT: .LBB78_6: # %cond.store1 
+; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 @@ -8447,6 +9338,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB78_3 ; RV32ZVE32F-NEXT: .LBB78_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 @@ -8454,9 +9347,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB78_4 ; RV32ZVE32F-NEXT: .LBB78_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: fsd fa3, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -8561,6 +9456,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_2 ; RV32ZVE32F-NEXT: .LBB80_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8568,6 +9465,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_3 ; RV32ZVE32F-NEXT: .LBB80_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8575,6 +9474,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_4 ; RV32ZVE32F-NEXT: .LBB80_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8582,6 +9483,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_5 ; RV32ZVE32F-NEXT: .LBB80_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8589,6 +9492,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_6 ; RV32ZVE32F-NEXT: .LBB80_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8596,6 +9501,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB80_7 ; RV32ZVE32F-NEXT: .LBB80_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8603,9 +9510,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB80_8 ; RV32ZVE32F-NEXT: .LBB80_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 
7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -8739,6 +9648,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_2 ; RV32ZVE32F-NEXT: .LBB81_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8746,6 +9657,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_3 ; RV32ZVE32F-NEXT: .LBB81_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8753,6 +9666,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_4 ; RV32ZVE32F-NEXT: .LBB81_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8760,6 +9675,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_5 ; RV32ZVE32F-NEXT: .LBB81_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8767,6 +9684,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_6 ; RV32ZVE32F-NEXT: .LBB81_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8774,6 +9693,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB81_7 ; RV32ZVE32F-NEXT: .LBB81_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8781,9 +9702,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB81_8 ; RV32ZVE32F-NEXT: .LBB81_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -8799,9 +9722,12 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB81_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB81_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -8809,72 +9735,76 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB81_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; 
RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: .LBB81_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_14 -; RV64ZVE32F-NEXT: .LBB81_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB81_10 -; RV64ZVE32F-NEXT: .LBB81_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB81_10: # %else10 +; RV64ZVE32F-NEXT: .LBB81_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB81_16 -; RV64ZVE32F-NEXT: .LBB81_12: # %else14 +; RV64ZVE32F-NEXT: .LBB81_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB81_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa3, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_8 -; RV64ZVE32F-NEXT: .LBB81_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_9 -; RV64ZVE32F-NEXT: j .LBB81_10 ; RV64ZVE32F-NEXT: .LBB81_15: # 
%cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB81_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB81_14 ; RV64ZVE32F-NEXT: .LBB81_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -8944,6 +9874,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_2 ; RV32ZVE32F-NEXT: .LBB82_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8951,6 +9883,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_3 ; RV32ZVE32F-NEXT: .LBB82_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8958,6 +9892,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_4 ; RV32ZVE32F-NEXT: .LBB82_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8965,6 +9901,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_5 ; RV32ZVE32F-NEXT: .LBB82_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8972,6 +9910,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_6 ; RV32ZVE32F-NEXT: .LBB82_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8979,6 +9919,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB82_7 ; RV32ZVE32F-NEXT: .LBB82_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -8986,9 +9928,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB82_8 ; RV32ZVE32F-NEXT: .LBB82_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9004,9 +9948,12 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB82_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, 
mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB82_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9014,72 +9961,76 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB82_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: .LBB82_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_14 -; RV64ZVE32F-NEXT: .LBB82_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB82_10 -; RV64ZVE32F-NEXT: .LBB82_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB82_10: # %else10 +; RV64ZVE32F-NEXT: .LBB82_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB82_16 -; RV64ZVE32F-NEXT: .LBB82_12: # %else14 +; RV64ZVE32F-NEXT: .LBB82_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB82_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, 
a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa3, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_8 -; RV64ZVE32F-NEXT: .LBB82_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_9 -; RV64ZVE32F-NEXT: j .LBB82_10 ; RV64ZVE32F-NEXT: .LBB82_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB82_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB82_14 ; RV64ZVE32F-NEXT: .LBB82_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -9150,6 +10101,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_2 ; RV32ZVE32F-NEXT: .LBB83_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9157,6 +10110,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_3 ; RV32ZVE32F-NEXT: .LBB83_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9164,6 +10119,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_4 ; RV32ZVE32F-NEXT: .LBB83_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9171,6 +10128,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_5 ; RV32ZVE32F-NEXT: .LBB83_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9178,6 +10137,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_6 ; RV32ZVE32F-NEXT: .LBB83_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9185,6 +10146,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB83_7 ; RV32ZVE32F-NEXT: .LBB83_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9192,9 +10155,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB83_8 ; RV32ZVE32F-NEXT: .LBB83_16: # %cond.store13 +; 
RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9211,9 +10176,12 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB83_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB83_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9222,12 +10190,14 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB83_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9235,52 +10205,52 @@ ; RV64ZVE32F-NEXT: .LBB83_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_14 -; RV64ZVE32F-NEXT: .LBB83_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_10 -; RV64ZVE32F-NEXT: .LBB83_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB83_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 -; RV64ZVE32F-NEXT: .LBB83_12: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_8 -; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: 
vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_9 -; RV64ZVE32F-NEXT: j .LBB83_10 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_12: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 +; RV64ZVE32F-NEXT: .LBB83_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -9288,11 +10258,13 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB83_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB83_14 ; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: andi a1, a1, 255 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 @@ -9362,6 +10334,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_2 ; RV32ZVE32F-NEXT: .LBB84_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9369,6 +10343,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_3 ; RV32ZVE32F-NEXT: .LBB84_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9376,6 +10352,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_4 ; RV32ZVE32F-NEXT: .LBB84_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9383,6 +10361,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_5 ; RV32ZVE32F-NEXT: .LBB84_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9390,6 +10370,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; 
RV32ZVE32F-NEXT: beqz a1, .LBB84_6 ; RV32ZVE32F-NEXT: .LBB84_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9397,6 +10379,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB84_7 ; RV32ZVE32F-NEXT: .LBB84_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9404,9 +10388,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB84_8 ; RV32ZVE32F-NEXT: .LBB84_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9423,9 +10409,12 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB84_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB84_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9433,72 +10422,76 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: .LBB84_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 -; RV64ZVE32F-NEXT: .LBB84_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB84_10 -; RV64ZVE32F-NEXT: .LBB84_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB84_10: # %else10 +; RV64ZVE32F-NEXT: .LBB84_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB84_16 -; RV64ZVE32F-NEXT: .LBB84_12: # %else14 +; RV64ZVE32F-NEXT: .LBB84_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB84_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa3, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 -; RV64ZVE32F-NEXT: .LBB84_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_9 -; RV64ZVE32F-NEXT: j .LBB84_10 ; RV64ZVE32F-NEXT: .LBB84_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB84_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_14 ; RV64ZVE32F-NEXT: .LBB84_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -9568,6 +10561,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_2 ; RV32ZVE32F-NEXT: .LBB85_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9575,6 +10570,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_3 ; RV32ZVE32F-NEXT: .LBB85_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9582,6 +10579,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_4 ; RV32ZVE32F-NEXT: .LBB85_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9589,6 +10588,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_5 ; RV32ZVE32F-NEXT: .LBB85_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9596,6 +10597,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_6 ; RV32ZVE32F-NEXT: .LBB85_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9603,6 +10606,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB85_7 ; RV32ZVE32F-NEXT: .LBB85_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9610,9 +10615,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB85_8 ; RV32ZVE32F-NEXT: .LBB85_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9629,9 +10636,12 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB85_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB85_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -9639,72 +10649,76 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB85_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: .LBB85_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd 
fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_14 -; RV64ZVE32F-NEXT: .LBB85_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB85_10 -; RV64ZVE32F-NEXT: .LBB85_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB85_10: # %else10 +; RV64ZVE32F-NEXT: .LBB85_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB85_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB85_16 -; RV64ZVE32F-NEXT: .LBB85_12: # %else14 +; RV64ZVE32F-NEXT: .LBB85_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB85_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa3, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_8 -; RV64ZVE32F-NEXT: .LBB85_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_9 -; RV64ZVE32F-NEXT: j .LBB85_10 ; RV64ZVE32F-NEXT: .LBB85_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB85_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB85_14 ; RV64ZVE32F-NEXT: .LBB85_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -9775,6 +10789,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_2 ; RV32ZVE32F-NEXT: .LBB86_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9782,6 +10798,8 @@ ; RV32ZVE32F-NEXT: andi 
a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_3 ; RV32ZVE32F-NEXT: .LBB86_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9789,6 +10807,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_4 ; RV32ZVE32F-NEXT: .LBB86_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9796,6 +10816,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_5 ; RV32ZVE32F-NEXT: .LBB86_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9803,6 +10825,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_6 ; RV32ZVE32F-NEXT: .LBB86_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9810,6 +10834,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB86_7 ; RV32ZVE32F-NEXT: .LBB86_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9817,9 +10843,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB86_8 ; RV32ZVE32F-NEXT: .LBB86_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -9839,9 +10867,12 @@ ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa0, 0(a3) ; RV64ZVE32F-NEXT: .LBB86_2: # %else +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a3, .LBB86_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -9850,12 +10881,14 @@ ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB86_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV64ZVE32F-NEXT: beqz a3, .LBB86_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -9863,52 +10896,52 @@ ; RV64ZVE32F-NEXT: .LBB86_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a3, 
a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB86_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_8: # %else6 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB86_14 -; RV64ZVE32F-NEXT: .LBB86_8: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: beqz a3, .LBB86_10 -; RV64ZVE32F-NEXT: .LBB86_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB86_10: # %else10 +; RV64ZVE32F-NEXT: .LBB86_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB86_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB86_16 -; RV64ZVE32F-NEXT: .LBB86_12: # %else14 +; RV64ZVE32F-NEXT: .LBB86_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB86_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB86_8 -; RV64ZVE32F-NEXT: .LBB86_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB86_9 -; RV64ZVE32F-NEXT: j .LBB86_10 ; RV64ZVE32F-NEXT: .LBB86_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -9916,11 +10949,13 @@ ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa6, 0(a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_14 ; RV64ZVE32F-NEXT: .LBB86_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a2, zero, 
e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: and a1, a2, a1 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 @@ -9988,6 +11023,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_2 ; RV32ZVE32F-NEXT: .LBB87_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -9995,6 +11032,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_3 ; RV32ZVE32F-NEXT: .LBB87_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10002,6 +11041,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_4 ; RV32ZVE32F-NEXT: .LBB87_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10009,6 +11050,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_5 ; RV32ZVE32F-NEXT: .LBB87_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10016,6 +11059,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_6 ; RV32ZVE32F-NEXT: .LBB87_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10023,6 +11068,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB87_7 ; RV32ZVE32F-NEXT: .LBB87_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10030,9 +11077,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB87_8 ; RV32ZVE32F-NEXT: .LBB87_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -10049,12 +11098,17 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB87_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB87_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) @@ -10063,70 +11117,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_14 ; RV64ZVE32F-NEXT: .LBB87_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_14 -; RV64ZVE32F-NEXT: .LBB87_7: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_8 +; RV64ZVE32F-NEXT: .LBB87_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB87_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_9 -; RV64ZVE32F-NEXT: .LBB87_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB87_9: # %else10 +; RV64ZVE32F-NEXT: .LBB87_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB87_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB87_16 -; RV64ZVE32F-NEXT: .LBB87_11: # %else14 +; RV64ZVE32F-NEXT: .LBB87_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB87_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB87_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB87_6 -; RV64ZVE32F-NEXT: .LBB87_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB87_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_7 -; RV64ZVE32F-NEXT: .LBB87_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_8 -; RV64ZVE32F-NEXT: j 
.LBB87_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_7 +; RV64ZVE32F-NEXT: j .LBB87_8 ; RV64ZVE32F-NEXT: .LBB87_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB87_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB87_12 ; RV64ZVE32F-NEXT: .LBB87_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -10195,6 +11254,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_2 ; RV32ZVE32F-NEXT: .LBB88_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10202,6 +11263,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_3 ; RV32ZVE32F-NEXT: .LBB88_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10209,6 +11272,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_4 ; RV32ZVE32F-NEXT: .LBB88_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10216,6 +11281,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_5 ; RV32ZVE32F-NEXT: .LBB88_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10223,6 +11290,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_6 ; RV32ZVE32F-NEXT: .LBB88_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10230,6 +11299,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB88_7 ; RV32ZVE32F-NEXT: .LBB88_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10237,9 +11308,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB88_8 ; RV32ZVE32F-NEXT: .LBB88_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -10256,12 +11329,17 @@ ; RV64ZVE32F-NEXT: add a2, a0, 
a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB88_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB88_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) @@ -10270,70 +11348,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_14 ; RV64ZVE32F-NEXT: .LBB88_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_14 -; RV64ZVE32F-NEXT: .LBB88_7: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_8 +; RV64ZVE32F-NEXT: .LBB88_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB88_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_9 -; RV64ZVE32F-NEXT: .LBB88_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB88_9: # %else10 +; RV64ZVE32F-NEXT: .LBB88_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB88_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB88_16 -; RV64ZVE32F-NEXT: .LBB88_11: # %else14 +; RV64ZVE32F-NEXT: .LBB88_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB88_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB88_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB88_6 -; RV64ZVE32F-NEXT: .LBB88_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB88_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: 
add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_7 -; RV64ZVE32F-NEXT: .LBB88_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_8 -; RV64ZVE32F-NEXT: j .LBB88_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_7 +; RV64ZVE32F-NEXT: j .LBB88_8 ; RV64ZVE32F-NEXT: .LBB88_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB88_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB88_12 ; RV64ZVE32F-NEXT: .LBB88_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 3 ; RV64ZVE32F-NEXT: add a0, a0, a1 ; RV64ZVE32F-NEXT: fsd fa7, 0(a0) @@ -10403,6 +11486,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_2 ; RV32ZVE32F-NEXT: .LBB89_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10410,6 +11495,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_3 ; RV32ZVE32F-NEXT: .LBB89_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10417,6 +11504,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_4 ; RV32ZVE32F-NEXT: .LBB89_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10424,6 +11513,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_5 ; RV32ZVE32F-NEXT: .LBB89_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10431,6 +11522,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_6 ; RV32ZVE32F-NEXT: .LBB89_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10438,6 +11531,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB89_7 ; RV32ZVE32F-NEXT: .LBB89_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10445,9 +11540,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB89_8 ; 
RV32ZVE32F-NEXT: .LBB89_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret ; @@ -10465,12 +11562,17 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa0, 0(a2) ; RV64ZVE32F-NEXT: .LBB89_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB89_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10480,63 +11582,66 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_13 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_13 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 ; RV64ZVE32F-NEXT: .LBB89_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 -; RV64ZVE32F-NEXT: .LBB89_7: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_8 +; RV64ZVE32F-NEXT: .LBB89_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB89_8: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_9 -; RV64ZVE32F-NEXT: .LBB89_8: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB89_9: # %else10 +; RV64ZVE32F-NEXT: .LBB89_10: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB89_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB89_16 -; RV64ZVE32F-NEXT: .LBB89_11: # %else14 +; RV64ZVE32F-NEXT: .LBB89_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB89_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB89_13: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; 
RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB89_6 -; RV64ZVE32F-NEXT: .LBB89_13: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB89_14: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_7 -; RV64ZVE32F-NEXT: .LBB89_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 32 -; RV64ZVE32F-NEXT: srli a2, a2, 29 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_8 -; RV64ZVE32F-NEXT: j .LBB89_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_7 +; RV64ZVE32F-NEXT: j .LBB89_8 ; RV64ZVE32F-NEXT: .LBB89_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 @@ -10544,11 +11649,13 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB89_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB89_12 ; RV64ZVE32F-NEXT: .LBB89_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: slli a1, a1, 32 ; RV64ZVE32F-NEXT: srli a1, a1, 29 ; RV64ZVE32F-NEXT: add a0, a0, a1 @@ -10636,9 +11743,11 @@ ; RV32ZVE32F-NEXT: andi a0, a0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB90_9 ; RV32ZVE32F-NEXT: .LBB90_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: .LBB90_9: # %else14 ; RV32ZVE32F-NEXT: addi sp, s0, -64 @@ -10653,6 +11762,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_2 ; RV32ZVE32F-NEXT: .LBB90_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10660,6 +11771,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 4 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_3 ; RV32ZVE32F-NEXT: .LBB90_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10667,6 +11780,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 8 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_4 ; RV32ZVE32F-NEXT: .LBB90_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10674,6 +11789,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 16 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_5 ; RV32ZVE32F-NEXT: .LBB90_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10681,6 +11798,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 32 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_6 ; RV32ZVE32F-NEXT: .LBB90_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10688,6 +11807,8 @@ ; RV32ZVE32F-NEXT: andi a1, a0, 64 ; RV32ZVE32F-NEXT: beqz a1, .LBB90_7 ; RV32ZVE32F-NEXT: .LBB90_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a1, v10 @@ -10817,192 +11938,240 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; RV64ZVE32F-NEXT: vse8.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB91_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB91_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_28 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, 
zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_29 -; RV64ZVE32F-NEXT: .LBB91_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB91_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB91_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_30 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_31 -; RV64ZVE32F-NEXT: .LBB91_12: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_32 -; RV64ZVE32F-NEXT: .LBB91_13: # %else16 +; RV64ZVE32F-NEXT: .LBB91_14: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_16 +; RV64ZVE32F-NEXT: .LBB91_15: # %cond.store15 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 8 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB91_16: # %else16 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_15 -; RV64ZVE32F-NEXT: .LBB91_14: # %cond.store17 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.store17 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 9 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_15: # %else18 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 9 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB91_18: # %else18 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB91_17: # %else20 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB91_20: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1 ; RV64ZVE32F-NEXT: addiw a3, a2, -2048 ; RV64ZVE32F-NEXT: and a3, a1, a3 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.store21 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 11 ; RV64ZVE32F-NEXT: vse8.v v10, (a3) -; RV64ZVE32F-NEXT: .LBB91_19: # %else22 +; RV64ZVE32F-NEXT: .LBB91_22: # %else22 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.store23 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_21: # %else24 +; RV64ZVE32F-NEXT: .LBB91_24: # %else24 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 2 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.store25 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; 
RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 13 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_23: # %else26 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB91_26: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 4 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.store27 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.store27 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 14 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_25: # %else28 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 14 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB91_28: # %else28 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB91_27 -; RV64ZVE32F-NEXT: # %bb.26: # %cond.store29 +; RV64ZVE32F-NEXT: beqz a1, .LBB91_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.store29 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 15 -; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB91_27: # %else30 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 15 +; RV64ZVE32F-NEXT: vse8.v v9, (a0) +; RV64ZVE32F-NEXT: .LBB91_30: # %else30 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 -; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_9 -; RV64ZVE32F-NEXT: j .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_30: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB91_31: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: 
vslidedown.vi v11, v8, 6 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_12 -; RV64ZVE32F-NEXT: .LBB91_31: # %cond.store13 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_14 +; RV64ZVE32F-NEXT: .LBB91_32: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_13 -; RV64ZVE32F-NEXT: .LBB91_32: # %cond.store15 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 8 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_14 -; RV64ZVE32F-NEXT: j .LBB91_15 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_15 +; RV64ZVE32F-NEXT: j .LBB91_16 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %val, <16 x i8*> %ptrs, i32 1, <16 x i1> %m) ret void @@ -11026,15 +12195,23 @@ ; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: vslidedown.vi v10, v10, 16 +; RV64-NEXT: vslidedown.vi v8, v10, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vslidedown.vi v8, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsoxei64.v v12, (a0), v16, v0.t ; RV64-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v32i8: @@ -11050,387 +12227,494 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vse8.v v8, (a2) ; RV64ZVE32F-NEXT: .LBB92_2: # %else +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi 
v12, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 ; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_6: # %else4 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_60 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_61 -; RV64ZVE32F-NEXT: .LBB92_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_10: # %else10 +; RV64ZVE32F-NEXT: .LBB92_10: # %else8 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v15, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v16, (a2) +; RV64ZVE32F-NEXT: .LBB92_12: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v10, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; 
RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_62 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 -; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_63 -; RV64ZVE32F-NEXT: .LBB92_12: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_64 -; RV64ZVE32F-NEXT: .LBB92_13: # %else16 +; RV64ZVE32F-NEXT: .LBB92_14: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_16 +; RV64ZVE32F-NEXT: .LBB92_15: # %cond.store15 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_16: # %else16 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_15 -; RV64ZVE32F-NEXT: .LBB92_14: # %cond.store17 +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.store17 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v14, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 9 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_15: # %else18 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 9 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_18: # %else18 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v15, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v16, v14, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v16 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 10 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_17: # %else20 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_20: # %else20 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1 ; RV64ZVE32F-NEXT: addiw a3, a2, -2048 ; RV64ZVE32F-NEXT: and a3, a1, a3 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB92_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 +; RV64ZVE32F-NEXT: vslidedown.vi v15, v14, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB92_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.store21 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v16, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v14 ; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetvli a4, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 11 -; RV64ZVE32F-NEXT: vse8.v v14, (a3) -; RV64ZVE32F-NEXT: .LBB92_19: # %else22 +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v16, (a3) +; RV64ZVE32F-NEXT: .LBB92_22: # %else22 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.store23 +; RV64ZVE32F-NEXT: vmv.x.s a2, v15 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 12 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_21: # %else24 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB92_24: # %else24 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 2 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.store25 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v15, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 13 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_23: # %else26 +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v16, (a2) +; RV64ZVE32F-NEXT: .LBB92_26: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 4 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.store27 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v15, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.store27 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 14 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_25: # %else28 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 14 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_28: # %else28 ; RV64ZVE32F-NEXT: lui a2, 8 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_27 -; RV64ZVE32F-NEXT: # %bb.26: # %cond.store29 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.store29 +; RV64ZVE32F-NEXT: 
vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 15 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_27: # %else30 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 15 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB92_30: # %else30 ; RV64ZVE32F-NEXT: lui a2, 16 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_29 -; RV64ZVE32F-NEXT: # %bb.28: # %cond.store31 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_32 +; RV64ZVE32F-NEXT: # %bb.31: # %cond.store31 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 16 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_29: # %else32 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 16 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB92_32: # %else32 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 32 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_31 -; RV64ZVE32F-NEXT: # %bb.30: # %cond.store33 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_34 +; RV64ZVE32F-NEXT: # %bb.33: # %cond.store33 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 17 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_31: # %else34 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 17 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_34: # %else34 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 64 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_33 -; RV64ZVE32F-NEXT: # %bb.32: # %cond.store35 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_36 +; RV64ZVE32F-NEXT: # %bb.35: # %cond.store35 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_33: # %else36 +; RV64ZVE32F-NEXT: .LBB92_36: # %else36 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; RV64ZVE32F-NEXT: lui a2, 128 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_35 -; RV64ZVE32F-NEXT: # %bb.34: # %cond.store37 +; 
RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_38 +; RV64ZVE32F-NEXT: # %bb.37: # %cond.store37 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 19 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_35: # %else38 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 19 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_38: # %else38 ; RV64ZVE32F-NEXT: lui a2, 256 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_37 -; RV64ZVE32F-NEXT: # %bb.36: # %cond.store39 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_40 +; RV64ZVE32F-NEXT: # %bb.39: # %cond.store39 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_37: # %else40 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 20 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_40: # %else40 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: lui a2, 512 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_39 -; RV64ZVE32F-NEXT: # %bb.38: # %cond.store41 +; RV64ZVE32F-NEXT: vmv.v.i v13, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_42 +; RV64ZVE32F-NEXT: # %bb.41: # %cond.store41 +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 21 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_39: # %else42 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 21 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_42: # %else42 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 1024 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_41 -; RV64ZVE32F-NEXT: # %bb.40: # %cond.store43 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v11, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_44 +; RV64ZVE32F-NEXT: # %bb.43: # %cond.store43 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 22 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_41: # %else44 +; RV64ZVE32F-NEXT: 
vslidedown.vi v14, v8, 22 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_44: # %else44 ; RV64ZVE32F-NEXT: lui a2, 2048 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_43 -; RV64ZVE32F-NEXT: # %bb.42: # %cond.store45 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.store45 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 23 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_43: # %else46 +; RV64ZVE32F-NEXT: .LBB92_46: # %else46 ; RV64ZVE32F-NEXT: lui a2, 4096 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_45 -; RV64ZVE32F-NEXT: # %bb.44: # %cond.store47 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.store47 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 24 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_45: # %else48 +; RV64ZVE32F-NEXT: .LBB92_48: # %else48 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 8192 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_47 -; RV64ZVE32F-NEXT: # %bb.46: # %cond.store49 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_50 +; RV64ZVE32F-NEXT: # %bb.49: # %cond.store49 +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 25 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_47: # %else50 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 25 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_50: # %else50 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 16384 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_49 -; RV64ZVE32F-NEXT: # %bb.48: # %cond.store51 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_52 +; RV64ZVE32F-NEXT: # %bb.51: # %cond.store51 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v14, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_49: # %else52 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 26 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_52: # %else52 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; 
RV64ZVE32F-NEXT: lui a2, 32768 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_51 -; RV64ZVE32F-NEXT: # %bb.50: # %cond.store53 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_54 +; RV64ZVE32F-NEXT: # %bb.53: # %cond.store53 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_51: # %else54 +; RV64ZVE32F-NEXT: .LBB92_54: # %else54 ; RV64ZVE32F-NEXT: lui a2, 65536 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_53 -; RV64ZVE32F-NEXT: # %bb.52: # %cond.store55 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_56 +; RV64ZVE32F-NEXT: # %bb.55: # %cond.store55 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_53: # %else56 +; RV64ZVE32F-NEXT: .LBB92_56: # %else56 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu ; RV64ZVE32F-NEXT: lui a2, 131072 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_55 -; RV64ZVE32F-NEXT: # %bb.54: # %cond.store57 +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_58 +; RV64ZVE32F-NEXT: # %bb.57: # %cond.store57 +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 29 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_55: # %else58 +; RV64ZVE32F-NEXT: .LBB92_58: # %else58 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: lui a2, 262144 ; RV64ZVE32F-NEXT: and a2, a1, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_57 -; RV64ZVE32F-NEXT: # %bb.56: # %cond.store59 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_60 +; RV64ZVE32F-NEXT: # %bb.59: # %cond.store59 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 30 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_57: # %else60 +; RV64ZVE32F-NEXT: .LBB92_60: # %else60 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB92_59 -; RV64ZVE32F-NEXT: # %bb.58: # %cond.store61 +; RV64ZVE32F-NEXT: beqz a1, .LBB92_62 +; 
RV64ZVE32F-NEXT: # %bb.61: # %cond.store61 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v11, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v11 ; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 31 -; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB92_59: # %else62 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 31 +; RV64ZVE32F-NEXT: vse8.v v10, (a0) +; RV64ZVE32F-NEXT: .LBB92_62: # %else62 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_8 -; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_9 -; RV64ZVE32F-NEXT: j .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store11 +; RV64ZVE32F-NEXT: .LBB92_63: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v16, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 6 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v16, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v16, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_12 -; RV64ZVE32F-NEXT: .LBB92_63: # %cond.store13 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_14 +; RV64ZVE32F-NEXT: .LBB92_64: # %cond.store13 +; RV64ZVE32F-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_13 -; RV64ZVE32F-NEXT: .LBB92_64: # %cond.store15 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v12, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 8 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_14 -; RV64ZVE32F-NEXT: j .LBB92_15 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_15 +; RV64ZVE32F-NEXT: j .LBB92_16 %ptrs = getelementptr inbounds i8, i8* 
%base, <32 x i8> %idxs call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %val, <32 x i8*> %ptrs, i32 1, <32 x i1> %m) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll @@ -9,7 +9,9 @@ define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -22,7 +24,9 @@ define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -37,7 +41,9 @@ define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -50,7 +56,9 @@ define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -65,7 +73,9 @@ define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -78,7 +88,9 @@ define float @vpreduce_ord_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -93,7 +105,9 @@ define float @vpreduce_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; 
CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -106,7 +120,9 @@ define float @vpreduce_ord_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -121,8 +137,10 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v64f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a2, a0, -32 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: addi a2, a0, -32 +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: bltu a0, a2, .LBB8_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 @@ -134,12 +152,16 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: .LBB8_4: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v25, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu ; CHECK-NEXT: vfredusum.vs v25, v8, v25, v0.t ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, ft0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu ; CHECK-NEXT: vmv1r.v v0, v24 @@ -153,8 +175,10 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v64f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a2, a0, -32 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: addi a2, a0, -32 +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: bltu a0, a2, .LBB9_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 @@ -166,12 +190,16 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: .LBB9_4: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v25, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu ; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, ft0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu ; CHECK-NEXT: vmv1r.v v0, v24 @@ -187,7 +215,9 @@ define double @vpreduce_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -200,7 +230,9 @@ define double @vpreduce_ord_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, 
m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -215,7 +247,9 @@ define double @vpreduce_fadd_v3f64(double %s, <3 x double> %v, <3 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredusum.vs v10, v8, v10, v0.t @@ -228,7 +262,9 @@ define double @vpreduce_ord_fadd_v3f64(double %s, <3 x double> %v, <3 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredosum.vs v10, v8, v10, v0.t @@ -243,7 +279,9 @@ define double @vpreduce_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredusum.vs v10, v8, v10, v0.t @@ -256,7 +294,9 @@ define double @vpreduce_ord_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredosum.vs v10, v8, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -22,9 +22,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <1 x half>, <1 x half>* %x %red = call half @llvm.vector.reduce.fadd.v1f16(half %s, <1 x half> %v) @@ -38,9 +44,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: 
vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v2f16(half %s, <2 x half> %v) @@ -52,9 +64,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call half @llvm.vector.reduce.fadd.v2f16(half %s, <2 x half> %v) @@ -68,9 +86,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v4f16(half %s, <4 x half> %v) @@ -82,9 +106,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call half @llvm.vector.reduce.fadd.v4f16(half %s, <4 x half> %v) @@ -98,9 +128,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <8 x half>, <8 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v8f16(half %s, <8 x half> %v) @@ -112,9 +148,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load 
<8 x half>, <8 x half>* %x %red = call half @llvm.vector.reduce.fadd.v8f16(half %s, <8 x half> %v) @@ -128,9 +170,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v16f16(half %s, <16 x half> %v) @@ -142,9 +190,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %red = call half @llvm.vector.reduce.fadd.v16f16(half %s, <16 x half> %v) @@ -159,11 +213,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vfredusum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v32f16(half %s, <32 x half> %v) @@ -176,11 +234,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %red = call half @llvm.vector.reduce.fadd.v32f16(half %s, <32 x half> %v) @@ -195,11 +257,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: 
vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v64f16(half %s, <64 x half> %v) @@ -212,11 +278,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %red = call half @llvm.vector.reduce.fadd.v64f16(half %s, <64 x half> %v) @@ -234,11 +304,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x half>, <128 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v128f16(half %s, <128 x half> %v) @@ -253,16 +327,24 @@ ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v16, v16, v24 -; CHECK-NEXT: vfmv.f.s ft0, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v16, v24 +; CHECK-NEXT: vfmv.f.s ft0, v25 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, ft0 -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x half>, <128 x half>* %x %red = call half @llvm.vector.reduce.fadd.v128f16(half %s, <128 x half> %v) @@ -289,9 +371,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: 
vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <1 x float>, <1 x float>* %x %red = call float @llvm.vector.reduce.fadd.v1f32(float %s, <1 x float> %v) @@ -319,12 +407,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <1 x half>, <1 x half>* %x %e = fpext <1 x half> %v to <1 x float> @@ -339,9 +431,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v2f32(float %s, <2 x float> %v) @@ -353,9 +451,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call float @llvm.vector.reduce.fadd.v2f32(float %s, <2 x float> %v) @@ -367,12 +471,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %e = fpext <2 x half> %v to <2 x float> @@ -385,12 +493,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %e = fpext <2 x half> %v to <2 x float> @@ -405,9 +517,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v) @@ -419,9 +537,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v) @@ -431,13 +555,18 @@ define float @vreduce_fwadd_v4f32(<4 x half>* %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %e = fpext <4 x half> %v to <4 x float> @@ -448,13 +577,18 @@ define float @vreduce_ord_fwadd_v4f32(<4 x half>* %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %e = fpext <4 x half> %v to <4 x float> @@ -469,9 +603,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <8 x float>, <8 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v8f32(float %s, <8 x float> %v) @@ -483,9 +623,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <8 x float>, <8 x float>* %x %red = call float @llvm.vector.reduce.fadd.v8f32(float %s, <8 x float> %v) @@ -497,12 +643,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <8 x half>, <8 x half>* %x %e = fpext <8 x half> %v to <8 x float> @@ -515,12 +665,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <8 x half>, <8 x half>* %x %e = fpext <8 x half> %v to <8 x float> @@ -535,9 +689,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfredusum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v16f32(float %s, <16 x float> %v) @@ -549,9 +709,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %red = call float @llvm.vector.reduce.fadd.v16f32(float %s, <16 x float> %v) @@ -563,12 +729,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vfwredusum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %e = fpext <16 x half> %v to <16 x float> @@ -581,12 +751,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vfwredosum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %e = fpext <16 x half> %v to <16 x float> @@ -602,11 +776,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %s, <32 x float> %v) @@ -619,11 +797,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: 
vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %red = call float @llvm.vector.reduce.fadd.v32f32(float %s, <32 x float> %v) @@ -636,12 +818,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vfwredusum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %e = fpext <32 x half> %v to <32 x float> @@ -655,12 +841,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %e = fpext <32 x half> %v to <32 x float> @@ -679,11 +869,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <64 x float>, <64 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s, <64 x float> %v) @@ -698,16 +892,24 @@ ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v16, v16, v24 -; CHECK-NEXT: vfmv.f.s ft0, v16 -; CHECK-NEXT: vsetivli 
zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v16, v24 +; CHECK-NEXT: vfmv.f.s ft0, v25 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, ft0 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <64 x float>, <64 x float>* %x %red = call float @llvm.vector.reduce.fadd.v64f32(float %s, <64 x float> %v) @@ -721,15 +923,21 @@ ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vfwadd.vv v16, v8, v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v24, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v9, v16, v8 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %e = fpext <64 x half> %v to <64 x float> @@ -744,20 +952,30 @@ ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v12, v16, v12 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v16, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v12 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vfmv.f.s ft0, v13 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, ft0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %e = fpext <64 x half> %v to <64 x float> @@ -785,9 +1003,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli 
zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <1 x double>, <1 x double>* %x %red = call double @llvm.vector.reduce.fadd.v1f64(double %s, <1 x double> %v) @@ -813,13 +1037,18 @@ define double @vreduce_ord_fwadd_v1f64(<1 x float>* %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <1 x float>, <1 x float>* %x %e = fpext <1 x float> %v to <1 x double> @@ -834,9 +1063,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v2f64(double %s, <2 x double> %v) @@ -848,9 +1083,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call double @llvm.vector.reduce.fadd.v2f64(double %s, <2 x double> %v) @@ -860,13 +1101,18 @@ define double @vreduce_fwadd_v2f64(<2 x float>* %x, double %s) { ; CHECK-LABEL: vreduce_fwadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, 
e32, mf2, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %e = fpext <2 x float> %v to <2 x double> @@ -877,13 +1123,18 @@ define double @vreduce_ord_fwadd_v2f64(<2 x float>* %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %e = fpext <2 x float> %v to <2 x double> @@ -898,9 +1149,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v4f64(double %s, <4 x double> %v) @@ -912,9 +1169,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call double @llvm.vector.reduce.fadd.v4f64(double %s, <4 x double> %v) @@ -926,12 +1189,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %e = fpext <4 x float> %v to <4 x double> @@ -944,12 +1211,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: 
vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %e = fpext <4 x float> %v to <4 x double> @@ -964,9 +1235,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; CHECK-NEXT: vfredusum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <8 x double>, <8 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v8f64(double %s, <8 x double> %v) @@ -978,9 +1255,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <8 x double>, <8 x double>* %x %red = call double @llvm.vector.reduce.fadd.v8f64(double %s, <8 x double> %v) @@ -992,12 +1275,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vfwredusum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <8 x float>, <8 x float>* %x %e = fpext <8 x float> %v to <8 x double> @@ -1010,12 +1297,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vfwredosum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <8 x float>, <8 x 
float>* %x %e = fpext <8 x float> %v to <8 x double> @@ -1030,9 +1321,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <16 x double>, <16 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v16f64(double %s, <16 x double> %v) @@ -1044,9 +1341,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <16 x double>, <16 x double>* %x %red = call double @llvm.vector.reduce.fadd.v16f64(double %s, <16 x double> %v) @@ -1058,12 +1361,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfwredusum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %e = fpext <16 x float> %v to <16 x double> @@ -1076,12 +1383,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %e = fpext <16 x float> %v to <16 x double> @@ -1098,10 +1409,17 @@ ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v24, fa0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vfadd.vv v8, v8, v16 -; CHECK-NEXT: vfredusum.vs v8, v8, v24 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, 
mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v16, v8, v24 +; CHECK-NEXT: vfmv.f.s fa0, v16 ; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v32f64(double %s, <32 x double> %v) @@ -1111,16 +1429,28 @@ define double @vreduce_ord_fadd_v32f64(<32 x double>* %x, double %s) { ; CHECK-LABEL: vreduce_ord_fadd_v32f64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v24 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: vfmv.s.f v8, ft0 -; CHECK-NEXT: vfredosum.vs v8, v16, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v16, v24 +; CHECK-NEXT: vfmv.f.s ft0, v25 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredosum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call double @llvm.vector.reduce.fadd.v32f64(double %s, <32 x double> %v) @@ -1169,16 +1499,22 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v8, fa0 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v24, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredusum.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %e = fpext <32 x float> %v to <32 x double> @@ -1192,20 +1528,30 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v8, v16, 16 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v16, v12 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v12 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s ft0, v13 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, ft0 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vfwredosum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %e = fpext <32 x float> %v to <32 x double> @@ -1224,9 +1570,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI68_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> %v) @@ -1244,9 +1592,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI69_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) @@ -1262,9 +1612,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) @@ -1280,9 +1632,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan ninf half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) @@ -1304,9 +1658,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI72_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v16, (a0), zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: 
vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x half>, <128 x half>* %x %red = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %v) @@ -1324,9 +1680,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI73_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %v) @@ -1344,9 +1702,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) @@ -1362,9 +1722,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI75_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) @@ -1380,9 +1742,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) @@ -1410,9 +1774,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI77_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v16, (a0), zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x float>, <128 x float>* %x %red = call float @llvm.vector.reduce.fmin.v128f32(<128 x float> %v) @@ -1430,9 +1796,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI78_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: 
vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %v) @@ -1450,9 +1818,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI79_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmin.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) @@ -1468,9 +1838,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI80_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmin.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) @@ -1486,9 +1858,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI81_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmin.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan ninf double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) @@ -1509,9 +1883,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI82_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %v) @@ -1529,9 +1905,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI83_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> %v) @@ -1549,9 +1927,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI84_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: 
vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) @@ -1567,9 +1947,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI85_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) @@ -1585,9 +1967,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI86_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan ninf half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) @@ -1609,9 +1993,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI87_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v16, (a0), zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x half>, <128 x half>* %x %red = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %v) @@ -1629,9 +2015,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI88_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %v) @@ -1649,9 +2037,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI89_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) @@ -1667,9 +2057,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI90_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero 
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) @@ -1685,9 +2077,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI91_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) @@ -1715,9 +2109,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI92_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v16, (a0), zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <128 x float>, <128 x float>* %x %red = call float @llvm.vector.reduce.fmax.v128f32(<128 x float> %v) @@ -1735,9 +2131,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI93_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %v) @@ -1755,9 +2153,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI94_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmax.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) @@ -1773,9 +2173,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI95_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmax.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) @@ -1791,9 +2193,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI96_0) ; CHECK-NEXT: vsetivli zero, 1, e64, 
m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vfredmax.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan ninf double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) @@ -1814,9 +2218,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI97_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %v) @@ -1828,9 +2234,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -9,7 +9,9 @@ define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -25,7 +27,9 @@ ; CHECK-LABEL: vpreduce_umax_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -40,7 +44,9 @@ define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -56,7 +62,9 @@ ; CHECK-LABEL: vpreduce_umin_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; 
CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -71,7 +79,9 @@ define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -86,7 +96,9 @@ define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -101,7 +113,9 @@ define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -116,7 +130,9 @@ define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -132,7 +148,9 @@ ; CHECK-LABEL: vpreduce_umin_v3i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -147,7 +165,9 @@ define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -163,7 +183,9 @@ ; CHECK-LABEL: vpreduce_umax_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -178,7 +200,9 @@ define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, 
e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -194,7 +218,9 @@ ; CHECK-LABEL: vpreduce_umin_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -209,7 +235,9 @@ define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -224,7 +252,9 @@ define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -239,7 +269,9 @@ define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -254,7 +286,9 @@ define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -269,7 +303,9 @@ define signext i16 @vpreduce_add_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -286,7 +322,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -297,7 +335,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, 
ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -312,7 +352,9 @@ define signext i16 @vpreduce_smax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -329,7 +371,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -340,7 +384,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -355,7 +401,9 @@ define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -370,7 +418,9 @@ define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -385,7 +435,9 @@ define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -400,7 +452,9 @@ define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -415,7 +469,9 @@ define signext i16 @vpreduce_add_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, 
e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -432,7 +488,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -443,7 +501,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -458,7 +518,9 @@ define signext i16 @vpreduce_smax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -475,7 +537,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -486,7 +550,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -501,7 +567,9 @@ define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -516,7 +584,9 @@ define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -531,7 +601,9 @@ define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: 
vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -546,7 +618,9 @@ define signext i16 @vpreduce_xor_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -561,7 +635,9 @@ define signext i32 @vpreduce_add_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -576,7 +652,9 @@ define signext i32 @vpreduce_umax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umax_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -587,7 +665,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -602,7 +682,9 @@ define signext i32 @vpreduce_smax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -617,7 +699,9 @@ define signext i32 @vpreduce_umin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umin_v2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -628,7 +712,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -643,7 +729,9 @@ define signext i32 @vpreduce_smin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -658,7 +746,9 @@ define signext i32 @vpreduce_and_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -673,7 +763,9 @@ define signext i32 @vpreduce_or_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -688,7 +780,9 @@ define signext i32 @vpreduce_xor_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -703,7 +797,9 @@ define signext i32 @vpreduce_add_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -718,7 +814,9 @@ define signext i32 @vpreduce_umax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umax_v4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -729,7 +827,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -744,7 +844,9 @@ define signext i32 @vpreduce_smax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -759,7 +861,9 @@ define signext i32 @vpreduce_umin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umin_v4i32: ; 
RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -770,7 +874,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -785,7 +891,9 @@ define signext i32 @vpreduce_smin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -800,7 +908,9 @@ define signext i32 @vpreduce_and_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -815,7 +925,9 @@ define signext i32 @vpreduce_or_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -830,7 +942,9 @@ define signext i32 @vpreduce_xor_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -845,8 +959,10 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a3, a1, -32 ; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: addi a3, a1, -32 +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: bltu a1, a3, .LBB49_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 @@ -858,12 +974,16 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB49_4: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v25, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu ; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t ; CHECK-NEXT: vmv.x.s a0, v25 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; 
CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, tu, mu ; CHECK-NEXT: vmv1r.v v0, v24 @@ -898,7 +1018,9 @@ ; ; RV64-LABEL: vpreduce_add_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -932,7 +1054,9 @@ ; ; RV64-LABEL: vpreduce_umax_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -966,7 +1090,9 @@ ; ; RV64-LABEL: vpreduce_smax_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -1000,7 +1126,9 @@ ; ; RV64-LABEL: vpreduce_umin_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -1034,7 +1162,9 @@ ; ; RV64-LABEL: vpreduce_smin_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -1068,7 +1198,9 @@ ; ; RV64-LABEL: vpreduce_and_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredand.vs v9, v8, v9, v0.t @@ -1102,7 +1234,9 @@ ; ; RV64-LABEL: vpreduce_or_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredor.vs v9, v8, v9, v0.t @@ -1136,7 +1270,9 @@ ; ; RV64-LABEL: vpreduce_xor_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -1170,7 +1306,9 @@ ; ; RV64-LABEL: vpreduce_add_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredsum.vs v10, v8, v10, v0.t @@ -1204,7 +1342,9 @@ ; ; RV64-LABEL: vpreduce_umax_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; 
RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmaxu.vs v10, v8, v10, v0.t @@ -1238,7 +1378,9 @@ ; ; RV64-LABEL: vpreduce_smax_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmax.vs v10, v8, v10, v0.t @@ -1272,7 +1414,9 @@ ; ; RV64-LABEL: vpreduce_umin_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredminu.vs v10, v8, v10, v0.t @@ -1306,7 +1450,9 @@ ; ; RV64-LABEL: vpreduce_smin_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmin.vs v10, v8, v10, v0.t @@ -1340,7 +1486,9 @@ ; ; RV64-LABEL: vpreduce_and_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredand.vs v10, v8, v10, v0.t @@ -1374,7 +1522,9 @@ ; ; RV64-LABEL: vpreduce_or_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredor.vs v10, v8, v10, v0.t @@ -1408,7 +1558,9 @@ ; ; RV64-LABEL: vpreduce_xor_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredxor.vs v10, v8, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -23,9 +23,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v) @@ -39,9 +45,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; 
CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v) @@ -55,9 +67,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) @@ -71,9 +89,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) @@ -88,11 +112,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v) @@ -107,11 +135,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredsum.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %v) @@ -126,11 +158,15 @@ ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; 
CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %v) @@ -148,11 +184,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %v) @@ -208,9 +248,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v) @@ -222,12 +268,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %e = sext <2 x i8> %v to <2 x i16> @@ -240,12 +290,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %e = zext <2 x i8> %v to <2 x i16> @@ -260,9 +314,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; 
CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) @@ -274,12 +334,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %e = sext <4 x i8> %v to <4 x i16> @@ -292,12 +356,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %e = zext <4 x i8> %v to <4 x i16> @@ -312,9 +380,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) @@ -324,13 +398,18 @@ define i16 @vwreduce_add_v8i16(<8 x i8>* %x) { ; CHECK-LABEL: vwreduce_add_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = 
load <8 x i8>, <8 x i8>* %x %e = sext <8 x i8> %v to <8 x i16> @@ -341,13 +420,18 @@ define i16 @vwreduce_uadd_v8i16(<8 x i8>* %x) { ; CHECK-LABEL: vwreduce_uadd_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %e = zext <8 x i8> %v to <8 x i16> @@ -362,9 +446,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v) @@ -376,12 +466,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %e = sext <16 x i8> %v to <16 x i16> @@ -394,12 +488,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %e = zext <16 x i8> %v to <16 x i16> @@ -415,11 +513,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; 
CHECK-NEXT: vredsum.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredsum.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %v) @@ -432,12 +534,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vwredsum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %e = sext <32 x i8> %v to <32 x i16> @@ -451,12 +557,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vwredsumu.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %e = zext <32 x i8> %v to <32 x i16> @@ -472,11 +582,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %v) @@ -489,12 +603,16 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vwredsum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %e = sext <64 x i8> %v to <64 x i16> @@ -508,12 +626,16 @@ ; 
CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vwredsumu.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %e = zext <64 x i8> %v to <64 x i16> @@ -532,11 +654,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %v) @@ -548,17 +674,23 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v24, v8 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %e = sext <128 x i8> %v to <128 x i16> @@ -571,17 +703,23 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwaddu.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v24, v8 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli 
zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %e = zext <128 x i8> %v to <128 x i16> @@ -638,9 +776,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v) @@ -652,12 +796,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %e = sext <2 x i16> %v to <2 x i32> @@ -670,12 +818,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %e = zext <2 x i16> %v to <2 x i32> @@ -690,9 +842,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) @@ -702,13 +860,18 @@ define i32 @vwreduce_add_v4i32(<4 x i16>* %x) { ; CHECK-LABEL: vwreduce_add_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i 
v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %e = sext <4 x i16> %v to <4 x i32> @@ -719,13 +882,18 @@ define i32 @vwreduce_uadd_v4i32(<4 x i16>* %x) { ; CHECK-LABEL: vwreduce_uadd_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %e = zext <4 x i16> %v to <4 x i32> @@ -740,9 +908,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v) @@ -754,12 +928,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %e = sext <8 x i16> %v to <8 x i32> @@ -772,12 +950,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; 
CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %e = zext <8 x i16> %v to <8 x i32> @@ -792,9 +974,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vredsum.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredsum.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v) @@ -806,12 +994,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vwredsum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %e = sext <16 x i16> %v to <16 x i32> @@ -824,12 +1016,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vwredsumu.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %e = zext <16 x i16> %v to <16 x i32> @@ -845,11 +1041,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %v) @@ -862,12 +1062,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vwredsum.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %e = sext <32 x i16> %v to <32 x i32> @@ -881,12 +1085,16 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vwredsumu.vs v13, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %e = zext <32 x i16> %v to <32 x i32> @@ -905,11 +1113,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %v) @@ -921,17 +1133,23 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v24, v8 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %e = sext <64 x i16> %v to <64 x i32> @@ -944,17 +1162,23 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: 
vwaddu.vv v24, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v24, v8 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu +; CHECK-NEXT: vredsum.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %e = zext <64 x i16> %v to <64 x i32> @@ -1043,12 +1267,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vredsum.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredsum.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1056,9 +1286,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vredsum.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredsum.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v) @@ -1068,28 +1304,38 @@ define i64 @vwreduce_add_v2i64(<2 x i32>* %x) { ; RV32-LABEL: vwreduce_add_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v9 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, tu, 
mu +; RV64-NEXT: vwredsum.vs v10, v8, v9 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %e = sext <2 x i32> %v to <2 x i64> @@ -1100,28 +1346,38 @@ define i64 @vwreduce_uadd_v2i64(<2 x i32>* %x) { ; RV32-LABEL: vwreduce_uadd_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v9 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_uadd_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v9 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %e = zext <2 x i32> %v to <2 x i64> @@ -1136,12 +1392,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vredsum.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredsum.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1149,9 +1411,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vredsum.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredsum.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) @@ -1163,15 +1431,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v9 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1179,12 +1451,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64-NEXT: vwredsum.vs v10, v8, v9 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %e = sext <4 x i32> %v to <4 x i64> @@ -1197,15 +1473,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v9 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1213,12 +1493,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v9 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %e = zext <4 x i32> %v to <4 x i64> @@ -1233,12 +1517,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vredsum.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredsum.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s 
a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1246,9 +1536,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vredsum.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredsum.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v) @@ -1260,15 +1556,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v10 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV32-NEXT: vwredsum.vs v11, v8, v10 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1276,12 +1576,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v10 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64-NEXT: vwredsum.vs v11, v8, v10 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %e = sext <8 x i32> %v to <8 x i64> @@ -1294,15 +1598,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v10 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV32-NEXT: vwredsumu.vs v11, v8, v10 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1310,12 +1618,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v10 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64-NEXT: vwredsumu.vs v11, v8, v10 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %e = zext <8 x i32> %v to <8 x i64> @@ -1330,12 +1642,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredsum.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1343,9 +1661,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredsum.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v) @@ -1357,15 +1681,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV32-NEXT: vwredsum.vs v13, v8, v12 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1373,12 +1701,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v12 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV64-NEXT: vwredsum.vs v13, v8, v12 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %e = sext <16 x i32> %v to <16 x 
i64> @@ -1391,15 +1723,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV32-NEXT: vwredsumu.vs v13, v8, v12 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1407,12 +1743,16 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v12 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV64-NEXT: vwredsumu.vs v13, v8, v12 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %e = zext <16 x i32> %v to <16 x i64> @@ -1429,13 +1769,20 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v24, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: vredsum.vs v8, v8, v24 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v16, v8, v24 +; RV32-NEXT: vmv.x.s a0, v16 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v16, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1445,10 +1792,17 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: vredsum.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %v) @@ -1460,18 +1814,24 @@ ; RV32: # %bb.0: ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; 
RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vwadd.vv v24, v8, v16 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredsum.vs v8, v24, v8 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vwadd.vv v8, v16, v24 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1479,16 +1839,22 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vwadd.vv v24, v8, v16 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredsum.vs v8, v24, v8 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vwadd.vv v8, v16, v24 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %e = sext <32 x i32> %v to <32 x i64> @@ -1501,18 +1867,24 @@ ; RV32: # %bb.0: ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vwaddu.vv v24, v8, v16 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredsum.vs v8, v24, v8 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vwaddu.vv v8, v16, v24 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1520,16 +1892,22 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; 
RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vwaddu.vv v24, v8, v16 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredsum.vs v8, v24, v8 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vwaddu.vv v8, v16, v24 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %e = zext <32 x i32> %v to <32 x i64> @@ -1553,12 +1931,18 @@ ; RV32-NEXT: vadd.vv v16, v24, v16 ; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredsum.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1575,9 +1959,15 @@ ; RV64-NEXT: vadd.vv v16, v24, v16 ; RV64-NEXT: vadd.vv v8, v8, v0 ; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredsum.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %v) @@ -1590,65 +1980,44 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v0, v8, 16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vslidedown.vi v24, v16, 16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: 
mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwadd.vv v0, v24, v8 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vwadd.vv v8, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwadd.vv v0, v8, v16 +; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v24, v16, v0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredsum.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v17, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add sp, sp, a2 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1658,62 +2027,41 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v24, v16, 16 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwadd.vv v0, 
v24, v8 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vwadd.vv v8, v24, v0 ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwadd.vv v0, v8, v16 +; RV64-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v24, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredsum.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -1729,65 +2077,44 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v0, v8, 16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vslidedown.vi v24, v16, 16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwaddu.vv v0, v24, v8 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vwaddu.vv v8, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vwaddu.vv v0, v8, v16 +; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v24, v16, v0 ; RV32-NEXT: vsetvli zero, zero, e64, m8, 
ta, mu -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredsum.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredsum.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v17, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add sp, sp, a2 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1797,62 +2124,41 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v24, v16, 16 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 -; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwaddu.vv v0, v24, v8 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vwaddu.vv v8, v24, v0 ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vwaddu.vv v0, v8, v16 +; RV64-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v24, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; 
RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredsum.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredsum.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -1885,9 +2191,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v) @@ -1903,9 +2211,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v) @@ -1921,9 +2231,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v) @@ -1939,9 +2251,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v) @@ -1958,9 +2272,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v) @@ -1977,9 +2293,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredand.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %v) @@ -1996,9 +2314,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %v) @@ -2018,9 +2338,11 @@ ; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %v) @@ -2050,9 +2372,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v) @@ -2068,9 +2392,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v) @@ -2086,9 +2412,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v) @@ -2104,9 +2432,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* 
%x %red = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v) @@ -2123,9 +2453,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredand.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %v) @@ -2142,9 +2474,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %v) @@ -2164,9 +2498,11 @@ ; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %v) @@ -2196,9 +2532,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v) @@ -2214,9 +2552,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v) @@ -2232,9 +2572,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v) @@ -2250,9 +2592,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; 
CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredand.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v) @@ -2269,9 +2613,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %v) @@ -2291,9 +2637,11 @@ ; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredand.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %v) @@ -2333,12 +2681,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredand.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2348,9 +2698,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, -1 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredand.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v) @@ -2366,12 +2718,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredand.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2381,9 +2735,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, -1 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu 
-; RV64-NEXT: vredand.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredand.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) @@ -2399,12 +2755,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredand.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2414,9 +2772,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v12, -1 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredand.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %v) @@ -2432,12 +2792,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredand.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2447,9 +2809,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredand.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %v) @@ -2468,12 +2832,14 @@ ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredand.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2486,9 +2852,11 @@ ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredand.vs v8, 
v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredand.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %v) @@ -2513,12 +2881,14 @@ ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredand.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2537,9 +2907,11 @@ ; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredand.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.and.v64i64(<64 x i64> %v) @@ -2567,9 +2939,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v) @@ -2583,9 +2961,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v) @@ -2599,9 +2983,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v) @@ -2615,9 +3005,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu 
; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v) @@ -2632,11 +3028,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v) @@ -2651,11 +3051,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %v) @@ -2670,11 +3074,15 @@ ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %v) @@ -2692,11 +3100,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 
@llvm.vector.reduce.or.v256i8(<256 x i8> %v) @@ -2724,9 +3136,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v) @@ -2740,9 +3158,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v) @@ -2756,9 +3180,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v) @@ -2772,9 +3202,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v) @@ -2789,11 +3225,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %v) @@ -2808,11 +3248,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; 
CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %v) @@ -2830,11 +3274,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %v) @@ -2862,9 +3310,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v) @@ -2878,9 +3332,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v) @@ -2894,9 +3354,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v) @@ -2910,9 +3376,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vredor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v) @@ -2927,11 +3399,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %v) @@ -2949,11 +3425,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %v) @@ -2991,12 +3471,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vredor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredor.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3004,9 +3490,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vredor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredor.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v) @@ -3020,12 +3512,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vredor.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredor.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3033,9 +3531,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vredor.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredor.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) @@ -3049,12 +3553,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vredor.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredor.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3062,9 +3572,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vredor.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredor.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %v) @@ -3078,12 +3594,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredor.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredor.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3091,9 +3613,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: 
vredor.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredor.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %v) @@ -3109,13 +3637,20 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v24, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vredor.vs v8, v8, v24 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredor.vs v16, v8, v24 +; RV32-NEXT: vmv.x.s a0, v16 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v16, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3125,10 +3660,17 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vredor.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredor.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %v) @@ -3151,12 +3693,18 @@ ; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v0 ; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredor.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredor.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3173,9 +3721,15 @@ ; RV64-NEXT: vor.vv v16, v24, v16 ; RV64-NEXT: vor.vv v8, v8, v0 ; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredor.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredor.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.or.v64i64(<64 x i64> %v) @@ -3203,9 +3757,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, 
v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v) @@ -3219,9 +3779,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v) @@ -3235,9 +3801,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v) @@ -3251,9 +3823,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v) @@ -3268,11 +3846,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v) @@ -3287,11 +3869,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredxor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %v) @@ -3306,11 +3892,15 @@ ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %v) @@ -3328,11 +3918,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vxor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %v) @@ -3360,9 +3954,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v) @@ -3376,9 +3976,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v) @@ -3392,9 +3998,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 
8, e16, m1, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v) @@ -3408,9 +4020,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v) @@ -3425,11 +4043,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredxor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %v) @@ -3444,11 +4066,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %v) @@ -3466,11 +4092,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vxor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %v) @@ -3498,9 +4128,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli 
a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v) @@ -3514,9 +4150,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v) @@ -3530,9 +4172,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v) @@ -3546,9 +4194,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vredxor.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredxor.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v) @@ -3563,11 +4217,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %v) @@ -3585,11 +4243,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vxor.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i 
v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredxor.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %v) @@ -3627,12 +4289,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredxor.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3640,9 +4308,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredxor.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v) @@ -3656,12 +4330,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vredxor.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredxor.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3669,9 +4349,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vredxor.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredxor.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) @@ -3685,12 +4371,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vredxor.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredxor.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, 
ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3698,9 +4390,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vredxor.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredxor.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %v) @@ -3714,12 +4412,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredxor.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredxor.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3727,9 +4431,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredxor.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredxor.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %v) @@ -3745,13 +4455,20 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v24, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vxor.vv v8, v8, v16 -; RV32-NEXT: vredxor.vs v8, v8, v24 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredxor.vs v16, v8, v24 +; RV32-NEXT: vmv.x.s a0, v16 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v16, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3761,10 +4478,17 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vxor.vv v8, v8, v16 -; RV64-NEXT: vredxor.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredxor.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; 
RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %v) @@ -3787,12 +4511,18 @@ ; RV32-NEXT: vxor.vv v16, v24, v16 ; RV32-NEXT: vxor.vv v8, v8, v0 ; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredxor.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredxor.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -3809,9 +4539,15 @@ ; RV64-NEXT: vxor.vv v16, v24, v16 ; RV64-NEXT: vxor.vv v8, v8, v0 ; RV64-NEXT: vxor.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredxor.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredxor.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.xor.v64i64(<64 x i64> %v) @@ -3840,9 +4576,15 @@ ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v) @@ -3857,9 +4599,15 @@ ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v) @@ -3874,9 +4622,15 @@ ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) @@ -3891,9 +4645,15 @@ ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, 
(a0) ; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) @@ -3909,11 +4669,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredmin.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v) @@ -3929,11 +4693,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredmin.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %v) @@ -3949,11 +4717,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmin.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %v) @@ -3972,11 +4744,15 @@ ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vmin.vv v8, v8, v16 ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmin.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 
@llvm.vector.reduce.smin.v256i8(<256 x i8> %v) @@ -4006,9 +4782,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v2i16: @@ -4017,9 +4799,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v) @@ -4035,9 +4823,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v4i16: @@ -4046,9 +4840,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) @@ -4064,9 +4864,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v8i16: @@ -4075,9 +4881,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; 
RV64-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v) @@ -4093,9 +4905,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vredmin.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; RV32-NEXT: vredmin.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v16i16: @@ -4104,9 +4922,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredmin.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; RV64-NEXT: vredmin.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v) @@ -4123,11 +4947,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; RV32-NEXT: vredmin.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v32i16: @@ -4137,11 +4965,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; RV64-NEXT: vredmin.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %v) @@ -4158,11 +4990,15 @@ ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v64i16: @@ -4172,11 +5008,15 @@ ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, 
e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %v) @@ -4196,11 +5036,15 @@ ; RV32-NEXT: vmin.vv v8, v8, v16 ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, a0 -; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v128i16: @@ -4213,11 +5057,15 @@ ; RV64-NEXT: vmin.vv v8, v8, v16 ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %v) @@ -4247,9 +5095,15 @@ ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v2i32: @@ -4258,9 +5112,15 @@ ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v) @@ -4276,9 +5136,15 @@ ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v4i32: @@ -4287,9 +5153,15 @@ ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) @@ -4305,9 +5177,15 @@ ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vredmin.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV32-NEXT: vredmin.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v8i32: @@ -4316,9 +5194,15 @@ ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredmin.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64-NEXT: vredmin.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v) @@ -4334,9 +5218,15 @@ ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, a0 -; RV32-NEXT: vredmin.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV32-NEXT: vredmin.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v16i32: @@ -4345,9 +5235,15 @@ ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vredmin.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; RV64-NEXT: vredmin.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %v) @@ -4364,11 +5260,15 @@ ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; 
RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, a0 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v32i32: @@ -4378,11 +5278,15 @@ ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %v) @@ -4402,11 +5306,15 @@ ; RV32-NEXT: vmin.vv v8, v8, v16 ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, a0 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_v64i32: @@ -4419,11 +5327,15 @@ ; RV64-NEXT: vmin.vv v8, v8, v16 ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %v) @@ -4471,12 +5383,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4487,9 +5401,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: 
vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v) @@ -4513,12 +5433,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredmin.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4529,9 +5451,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredmin.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredmin.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) @@ -4555,12 +5483,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredmin.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4571,9 +5501,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vredmin.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredmin.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %v) @@ -4597,12 +5533,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ 
-4613,9 +5551,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %v) @@ -4642,12 +5586,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4660,10 +5606,17 @@ ; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, a0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vmin.vv v8, v8, v16 -; RV64-NEXT: vredmin.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmin.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %v) @@ -4695,12 +5648,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmin.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4720,9 +5675,15 @@ ; RV64-NEXT: vmin.vv v8, v8, v16 ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vredmin.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmin.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.smin.v64i64(<64 x i64> %v) @@ -4751,9 +5712,15 @@ ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, 
mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v) @@ -4768,9 +5735,15 @@ ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v) @@ -4785,9 +5758,15 @@ ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) @@ -4802,9 +5781,15 @@ ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) @@ -4820,11 +5805,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v) @@ -4840,11 +5829,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v12 -; 
CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredmax.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %v) @@ -4860,11 +5853,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %v) @@ -4883,11 +5880,15 @@ ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vmax.vv v8, v8, v16 ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %v) @@ -4916,9 +5917,15 @@ ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v) @@ -4933,9 +5940,15 @@ ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) @@ -4950,9 +5963,15 @@ ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; 
CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) @@ -4967,9 +5986,15 @@ ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v) @@ -4985,11 +6010,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredmax.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %v) @@ -5005,11 +6034,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %v) @@ -5028,11 +6061,15 @@ ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vmax.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %v) @@ -5061,9 +6098,15 @@ ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, 
e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v) @@ -5078,9 +6121,15 @@ ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) @@ -5095,9 +6144,15 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v) @@ -5112,9 +6167,15 @@ ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredmax.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %v) @@ -5130,11 +6191,15 @@ ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %v) @@ -5153,11 +6218,15 @@ ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vmax.vv v8, v8, v16 ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetvli 
zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredmax.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %v) @@ -5203,12 +6272,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredmax.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5219,9 +6290,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vredmax.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredmax.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v) @@ -5243,12 +6320,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredmax.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5259,9 +6338,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredmax.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredmax.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) @@ -5283,12 +6368,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredmax.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, 
a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5299,9 +6386,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vredmax.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredmax.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %v) @@ -5323,12 +6416,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmax.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5339,9 +6434,15 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vredmax.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmax.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %v) @@ -5366,12 +6467,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmax.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5384,10 +6487,17 @@ ; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, a0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vmax.vv v8, v8, v16 -; RV64-NEXT: vredmax.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmax.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %v) @@ -5417,12 +6527,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetivli zero, 16, e64, 
m8, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmax.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -5442,9 +6554,15 @@ ; RV64-NEXT: vmax.vv v8, v8, v16 ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, a0 -; RV64-NEXT: vredmax.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmax.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> %v) @@ -5474,9 +6592,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v) @@ -5492,9 +6612,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v) @@ -5510,9 +6632,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) @@ -5528,9 +6652,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) @@ -5547,9 +6673,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v10 
-; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v) @@ -5566,9 +6694,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredminu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %v) @@ -5585,9 +6715,11 @@ ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %v) @@ -5607,9 +6739,11 @@ ; CHECK-NEXT: vminu.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %v) @@ -5639,9 +6773,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v) @@ -5657,9 +6793,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) @@ -5675,9 +6813,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredminu.vs 
v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) @@ -5693,9 +6833,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v) @@ -5712,9 +6854,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredminu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %v) @@ -5731,9 +6875,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %v) @@ -5753,9 +6899,11 @@ ; CHECK-NEXT: vminu.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %v) @@ -5785,9 +6933,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v) @@ -5803,9 +6953,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> 
%v) @@ -5821,9 +6973,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v) @@ -5839,9 +6993,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredminu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %v) @@ -5858,9 +7014,11 @@ ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %v) @@ -5880,9 +7038,11 @@ ; CHECK-NEXT: vminu.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v16, -1 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredminu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %v) @@ -5922,12 +7082,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredminu.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -5937,9 +7099,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, -1 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredminu.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v) @@ -5955,12 +7119,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; 
RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredminu.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -5970,9 +7136,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, -1 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredminu.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) @@ -5988,12 +7156,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredminu.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6003,9 +7173,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v12, -1 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredminu.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %v) @@ -6021,12 +7193,14 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredminu.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6036,9 +7210,11 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredminu.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %v) @@ -6057,12 +7233,14 @@ ; RV32-NEXT: vminu.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i 
v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredminu.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6075,9 +7253,11 @@ ; RV64-NEXT: vminu.vv v8, v8, v16 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredminu.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %v) @@ -6102,12 +7282,14 @@ ; RV32-NEXT: vminu.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredminu.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6126,9 +7308,11 @@ ; RV64-NEXT: vminu.vv v8, v8, v16 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v16, -1 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredminu.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.umin.v64i64(<64 x i64> %v) @@ -6156,9 +7340,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i8>, <2 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v) @@ -6172,9 +7362,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i8>, <4 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> 
%v) @@ -6188,9 +7384,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i8>, <8 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) @@ -6204,9 +7406,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <16 x i8>, <16 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v) @@ -6221,11 +7429,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <32 x i8>, <32 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v) @@ -6240,11 +7452,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vredmaxu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <64 x i8>, <64 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %v) @@ -6259,11 +7475,15 @@ ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i8>, <128 x i8>* %x %red = call i8 
@llvm.vector.reduce.umax.v128i8(<128 x i8> %v) @@ -6281,11 +7501,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vmaxu.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <256 x i8>, <256 x i8>* %x %red = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %v) @@ -6313,9 +7537,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i16>, <2 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v) @@ -6329,9 +7559,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i16>, <4 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) @@ -6345,9 +7581,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <8 x i16>, <8 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) @@ -6361,9 +7603,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <16 x i16>, <16 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v) @@ -6378,11 +7626,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, 
m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vredmaxu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <32 x i16>, <32 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %v) @@ -6397,11 +7649,15 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i16>, <64 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %v) @@ -6419,11 +7675,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vmaxu.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <128 x i16>, <128 x i16>* %x %red = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %v) @@ -6451,9 +7711,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <2 x i32>, <2 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v) @@ -6467,9 +7733,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %v = load <4 x i32>, <4 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> 
%v) @@ -6483,9 +7755,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %v = load <8 x i32>, <8 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v) @@ -6499,9 +7777,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v12 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, mu +; CHECK-NEXT: vredmaxu.vs v13, v8, v12 +; CHECK-NEXT: vmv.x.s a0, v13 ; CHECK-NEXT: ret %v = load <16 x i32>, <16 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %v) @@ -6516,11 +7800,15 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <32 x i32>, <32 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %v) @@ -6538,11 +7826,15 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vmaxu.vv v8, v8, v16 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v16 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vredmaxu.vs v17, v8, v16 +; CHECK-NEXT: vmv.x.s a0, v17 ; CHECK-NEXT: ret %v = load <64 x i32>, <64 x i32>* %x %red = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %v) @@ -6580,12 +7872,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vredmaxu.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: 
vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6593,9 +7891,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vredmaxu.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %v = load <2 x i64>, <2 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v) @@ -6609,12 +7913,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vredmaxu.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV32-NEXT: vredmaxu.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6622,9 +7932,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vredmaxu.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; RV64-NEXT: vredmaxu.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %v = load <4 x i64>, <4 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) @@ -6638,12 +7954,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vredmaxu.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV32-NEXT: vredmaxu.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6651,9 +7973,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vredmaxu.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, tu, mu +; RV64-NEXT: vredmaxu.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %v = load <8 x i64>, <8 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %v) @@ -6667,12 +7995,18 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, 
zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredmaxu.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmaxu.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6680,9 +8014,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredmaxu.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmaxu.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <16 x i64>, <16 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %v) @@ -6698,13 +8038,20 @@ ; RV32-NEXT: vle64.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v24, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vmaxu.vv v8, v8, v16 -; RV32-NEXT: vredmaxu.vs v8, v8, v24 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmaxu.vs v16, v8, v24 +; RV32-NEXT: vmv.x.s a0, v16 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v16, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6714,10 +8061,17 @@ ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v24, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vmaxu.vv v8, v8, v16 -; RV64-NEXT: vredmaxu.vs v8, v8, v24 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmaxu.vs v16, v8, v24 +; RV64-NEXT: vmv.x.s a0, v16 ; RV64-NEXT: ret %v = load <32 x i64>, <32 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %v) @@ -6740,12 +8094,18 @@ ; RV32-NEXT: vmaxu.vv v16, v24, v16 ; RV32-NEXT: vmaxu.vv v8, v8, v0 ; RV32-NEXT: vmaxu.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v16, zero -; RV32-NEXT: vredmaxu.vs v8, v8, v16 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v17, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV32-NEXT: vredmaxu.vs v17, v8, v16 +; RV32-NEXT: vmv.x.s a0, v17 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v17, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -6762,9 +8122,15 @@ ; RV64-NEXT: vmaxu.vv v16, v24, v16 ; 
RV64-NEXT: vmaxu.vv v8, v8, v0 ; RV64-NEXT: vmaxu.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v16, zero -; RV64-NEXT: vredmaxu.vs v8, v8, v16 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v17, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, tu, mu +; RV64-NEXT: vredmaxu.vs v17, v8, v16 +; RV64-NEXT: vmv.x.s a0, v17 ; RV64-NEXT: ret %v = load <64 x i64>, <64 x i64>* %x %red = call i64 @llvm.vector.reduce.umax.v64i64(<64 x i64> %v) @@ -6808,6 +8174,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -6826,8 +8195,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -6846,10 +8221,19 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -6869,12 +8253,24 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: vrgather.vi v10, v8, 1 @@ -6895,14 +8291,29 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: vrgather.vi v12, v8, 1 @@ -6923,17 +8334,35 @@ ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -6957,17 +8386,35 @@ ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv 
v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -7016,6 +8463,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -7034,8 +8484,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -7054,10 +8510,19 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: vrgather.vi v10, v8, 1 @@ -7077,12 +8542,24 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: vrgather.vi v12, v8, 1 @@ -7103,14 +8580,29 @@ ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; 
CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -7134,14 +8626,29 @@ ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -7190,6 +8697,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: vrgather.vi v9, v8, 1 @@ -7208,8 +8718,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: vrgather.vi v10, v8, 1 @@ -7228,10 +8744,19 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vslidedown.vi v12, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: vrgather.vi v12, v8, 1 @@ -7251,12 +8776,24 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; 
CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -7279,12 +8816,24 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 8 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 4 ; CHECK-NEXT: vmul.vv v8, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 2 ; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: vrgather.vi v16, v8, 1 @@ -7357,6 +8906,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: vrgather.vi v10, v8, 1 @@ -7372,6 +8924,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vslidedown.vi v10, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: vrgather.vi v10, v8, 1 @@ -7390,8 +8945,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vrgather.vi v12, v8, 1 @@ -7407,8 +8968,14 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 4 ; RV64-NEXT: vmul.vv v8, v8, v12 +; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vrgather.vi v12, v8, 1 @@ -7427,10 +8994,19 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; 
RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vrgather.vi v16, v8, 1 @@ -7446,10 +9022,19 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 4 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: vrgather.vi v16, v8, 1 @@ -7471,19 +9056,30 @@ ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vrgather.vi v16, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 0, e32, m8, ta, mu ; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vslidedown.vi v16, v8, 1 +; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_mul_v32i64: @@ -7493,10 +9089,19 @@ ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 4 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: vrgather.vi v16, v8, 1 @@ -7524,19 +9129,30 @@ ; RV32-NEXT: vmul.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v0 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vrgather.vi v16, v8, 1 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 0, e32, m8, ta, mu ; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vslidedown.vi v16, v8, 1 +; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_mul_v64i64: @@ -7552,10 +9168,19 @@ ; RV64-NEXT: vmul.vv v16, v24, v16 ; RV64-NEXT: vmul.vv v8, v8, v0 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 4 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 2 ; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: vrgather.vi v16, v8, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -9,8 +9,11 @@ define <7 x i1> @fcmp_oeq_vv_v7f16(<7 x half> %va, <7 x half> %vb, <7 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_v7f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <7 x i1> @llvm.vp.fcmp.v7f16(<7 x half> %va, <7 x half> %vb, metadata !"oeq", <7 x i1> %m, i32 %evl) ret <7 x i1> %v @@ -21,8 +24,11 @@ define <8 x i1> @fcmp_oeq_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"oeq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -31,8 +37,11 @@ define <8 x i1> @fcmp_oeq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> 
%elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -43,8 +52,11 @@ define <8 x i1> @fcmp_oeq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -55,8 +67,11 @@ define <8 x i1> @fcmp_ogt_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ogt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -65,8 +80,11 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -77,8 +95,11 @@ define <8 x i1> @fcmp_ogt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -89,8 +110,11 @@ define <8 x i1> @fcmp_oge_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"oge", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -99,8 +123,11 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfge.vf 
v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -111,8 +138,11 @@ define <8 x i1> @fcmp_oge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -123,8 +153,11 @@ define <8 x i1> @fcmp_olt_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"olt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -133,8 +166,11 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -145,8 +181,11 @@ define <8 x i1> @fcmp_olt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -157,8 +196,11 @@ define <8 x i1> @fcmp_ole_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ole", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -167,8 +209,11 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vf 
v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -179,8 +224,11 @@ define <8 x i1> @fcmp_ole_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -191,11 +239,15 @@ define <8 x i1> @fcmp_one_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"one", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -204,11 +256,15 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -219,11 +275,15 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, 
half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -234,11 +294,15 @@ define <8 x i1> @fcmp_ord_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ord", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -249,11 +313,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -266,11 +334,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -281,11 +353,15 @@ define <8 x i1> @fcmp_ueq_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmnor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x 
half> %va, <8 x half> %vb, metadata !"ueq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -294,11 +370,15 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -309,11 +389,15 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -324,10 +408,11 @@ define <8 x i1> @fcmp_ugt_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ugt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -336,10 +421,11 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -350,10 +436,11 @@ define <8 x i1> @fcmp_ugt_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, 
zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -364,10 +451,11 @@ define <8 x i1> @fcmp_uge_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"uge", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -376,10 +464,11 @@ define <8 x i1> @fcmp_uge_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -390,10 +479,11 @@ define <8 x i1> @fcmp_uge_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -404,10 +494,11 @@ define <8 x i1> @fcmp_ult_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ult", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -416,10 +507,11 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -430,10 +522,11 @@ define <8 x i1> @fcmp_ult_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -444,10 +537,11 @@ define <8 x i1> @fcmp_ule_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ule", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -456,10 +550,11 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -470,10 +565,11 @@ define <8 x i1> @fcmp_ule_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -484,8 +580,11 @@ define <8 x i1> @fcmp_une_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, 
a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"une", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -494,8 +593,11 @@ define <8 x i1> @fcmp_une_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -506,8 +608,11 @@ define <8 x i1> @fcmp_une_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -518,11 +623,15 @@ define <8 x i1> @fcmp_uno_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"uno", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -533,11 +642,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -550,11 +663,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, 
zero, e8, mf2, ta, mu -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -570,41 +687,57 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: addi a4, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v24, (a4) +; CHECK-NEXT: vle16.v v8, (a4) +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, mu ; CHECK-NEXT: addi a4, a2, -64 -; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vslidedown.vi v0, v2, 8 ; CHECK-NEXT: bltu a2, a4, .LBB43_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a4 ; CHECK-NEXT: .LBB43_2: ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v2, v16, v24, v0.t +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, mu +; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB43_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: .LBB43_4: -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v24, v8, v0.t +; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vsetivli zero, 16, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vi v16, v2, 8 +; CHECK-NEXT: vslideup.vi v16, v1, 8 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -617,7 +750,9 @@ define <7 x i1> @fcmp_oeq_vv_v7f64(<7 x double> %va, <7 x double> %vb, <7 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_v7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -630,7 +765,9 
@@ define <8 x i1> @fcmp_oeq_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -641,7 +778,9 @@ define <8 x i1> @fcmp_oeq_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -654,7 +793,9 @@ define <8 x i1> @fcmp_oeq_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -667,7 +808,9 @@ define <8 x i1> @fcmp_ogt_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -678,7 +821,9 @@ define <8 x i1> @fcmp_ogt_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -691,7 +836,9 @@ define <8 x i1> @fcmp_ogt_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -704,7 +851,9 @@ define <8 x i1> @fcmp_oge_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -715,7 +864,9 @@ define <8 x i1> @fcmp_oge_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfge.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -728,7 +879,9 @@ define <8 x i1> 
@fcmp_oge_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -741,7 +894,9 @@ define <8 x i1> @fcmp_olt_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -752,7 +907,9 @@ define <8 x i1> @fcmp_olt_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -765,7 +922,9 @@ define <8 x i1> @fcmp_olt_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -778,7 +937,9 @@ define <8 x i1> @fcmp_ole_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -789,7 +950,9 @@ define <8 x i1> @fcmp_ole_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -802,7 +965,9 @@ define <8 x i1> @fcmp_ole_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfge.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -815,10 +980,14 @@ define <8 x i1> @fcmp_one_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v17, v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v17, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"one", <8 x i1> %m, i32 %evl) @@ -828,10 +997,14 @@ define <8 x i1> @fcmp_one_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v13, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -843,10 +1016,14 @@ define <8 x i1> @fcmp_one_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v13, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -858,10 +1035,14 @@ define <8 x i1> @fcmp_ord_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmand.mm v0, v12, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ord", <8 x i1> %m, i32 %evl) @@ -873,10 +1054,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vf v16, v12, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmand.mm v0, v12, v16 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -890,10 +1075,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vf v16, 
v12, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmand.mm v0, v16, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -905,10 +1094,14 @@ define <8 x i1> @fcmp_ueq_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v17, v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnor.mm v0, v17, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ueq", <8 x i1> %m, i32 %evl) @@ -918,10 +1111,14 @@ define <8 x i1> @fcmp_ueq_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnor.mm v0, v13, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -933,10 +1130,14 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnor.mm v0, v13, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -948,9 +1149,10 @@ define <8 x i1> @fcmp_ugt_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vv v16, v8, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ugt", <8 x i1> %m, i32 %evl) @@ -960,9 +1162,10 @@ define <8 x i1> @fcmp_ugt_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, 
e64, m4, ta, mu ; CHECK-NEXT: vmfle.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -974,9 +1177,10 @@ define <8 x i1> @fcmp_ugt_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfge.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -988,9 +1192,10 @@ define <8 x i1> @fcmp_uge_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v8, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"uge", <8 x i1> %m, i32 %evl) @@ -1000,9 +1205,10 @@ define <8 x i1> @fcmp_uge_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1014,9 +1220,10 @@ define <8 x i1> @fcmp_uge_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1028,9 +1235,10 @@ define <8 x i1> @fcmp_ult_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vv v16, v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ult", <8 x i1> %m, i32 %evl) @@ -1040,9 +1248,10 @@ define <8 x i1> @fcmp_ult_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfge.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, 
mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1054,9 +1263,10 @@ define <8 x i1> @fcmp_ult_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfle.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1068,9 +1278,10 @@ define <8 x i1> @fcmp_ule_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vv v16, v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ule", <8 x i1> %m, i32 %evl) @@ -1080,9 +1291,10 @@ define <8 x i1> @fcmp_ule_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1094,9 +1306,10 @@ define <8 x i1> @fcmp_ule_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1108,7 +1321,9 @@ define <8 x i1> @fcmp_une_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1119,7 +1334,9 @@ define <8 x i1> @fcmp_une_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1132,7 +1349,9 @@ define <8 x i1> @fcmp_une_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli 
a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1145,10 +1364,14 @@ define <8 x i1> @fcmp_uno_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v16, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v12, v16 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"uno", <8 x i1> %m, i32 %evl) @@ -1160,10 +1383,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v12, v16 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1177,10 +1404,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmor.mm v0, v16, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1201,15 +1432,14 @@ ; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v2, 2 ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a3, a2, -16 ; CHECK-NEXT: csrr a1, vlenb @@ -1223,30 +1453,37 @@ ; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: .LBB87_2: ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; CHECK-NEXT: li a0, 16 -; 
CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t ; CHECK-NEXT: bltu a2, a0, .LBB87_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: .LBB87_4: -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v24, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v16, v1, 2 ; CHECK-NEXT: vmv1r.v v0, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -15,9 +15,12 @@ ; CHECK-NEXT: li a1, 127 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i7(<8 x i7> %va, <8 x i7> %vb, metadata !"eq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -28,11 +31,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a2 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vand.vx v10, v8, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i7> poison, i7 %b, i32 0 %vb = shufflevector <8 x i7> %elt.head, <8 x i7> poison, <8 x i32> zeroinitializer @@ -45,11 +51,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v9, v8, v0.t +; CHECK-NEXT: vand.vx v9, v8, a2 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vand.vx v10, v8, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i7> poison, i7 %b, i32 0 %vb = shufflevector <8 x 
i7> %elt.head, <8 x i7> poison, <8 x i32> zeroinitializer @@ -62,8 +71,11 @@ define <5 x i1> @icmp_eq_vv_v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_v5i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <5 x i1> @llvm.vp.icmp.v5i8(<5 x i8> %va, <5 x i8> %vb, metadata !"eq", <5 x i1> %m, i32 %evl) ret <5 x i1> %v @@ -72,8 +84,11 @@ define <5 x i1> @icmp_eq_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_v5i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer @@ -84,8 +99,11 @@ define <5 x i1> @icmp_eq_vx_swap_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_v5i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer @@ -98,8 +116,11 @@ define <8 x i1> @icmp_eq_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"eq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -108,8 +129,11 @@ define <8 x i1> @icmp_eq_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -120,8 +144,11 @@ define <8 x i1> @icmp_eq_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = 
insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -132,8 +159,11 @@ define <8 x i1> @icmp_eq_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -144,8 +174,11 @@ define <8 x i1> @icmp_eq_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -156,8 +189,11 @@ define <8 x i1> @icmp_ne_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ne", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -166,8 +202,11 @@ define <8 x i1> @icmp_ne_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -178,8 +217,11 @@ define <8 x i1> @icmp_ne_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -190,8 +232,11 @@ define <8 x i1> @icmp_ne_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; 
CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -202,8 +247,11 @@ define <8 x i1> @icmp_ne_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -214,8 +262,11 @@ define <8 x i1> @icmp_ugt_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ugt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -224,8 +275,11 @@ define <8 x i1> @icmp_ugt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -236,8 +290,11 @@ define <8 x i1> @icmp_ugt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -248,8 +305,11 @@ define <8 x i1> @icmp_ugt_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -260,8 +320,11 @@ define <8 x i1> @icmp_ugt_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: 
vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -272,8 +335,11 @@ define <8 x i1> @icmp_uge_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"uge", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -283,9 +349,12 @@ ; CHECK-LABEL: icmp_uge_vx_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -296,8 +365,11 @@ define <8 x i1> @icmp_uge_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -308,8 +380,11 @@ define <8 x i1> @icmp_uge_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -320,8 +395,11 @@ define <8 x i1> @icmp_uge_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -332,8 +410,11 @@ define <8 x i1> @icmp_ult_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv 
v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ult", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -342,8 +423,11 @@ define <8 x i1> @icmp_ult_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -354,8 +438,11 @@ define <8 x i1> @icmp_ult_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -366,8 +453,11 @@ define <8 x i1> @icmp_ult_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -378,8 +468,11 @@ define <8 x i1> @icmp_ult_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -390,8 +483,11 @@ define <8 x i1> @icmp_sgt_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmslt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sgt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -400,8 +496,11 @@ define <8 x i1> @icmp_sgt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_v8i8: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -412,8 +511,11 @@ define <8 x i1> @icmp_sgt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -424,8 +526,11 @@ define <8 x i1> @icmp_sgt_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -436,8 +541,11 @@ define <8 x i1> @icmp_sgt_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -448,8 +556,11 @@ define <8 x i1> @icmp_sge_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sge", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -459,9 +570,12 @@ ; CHECK-LABEL: icmp_sge_vx_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -472,8 +586,11 @@ define <8 x i1> @icmp_sge_vx_swap_v8i8(<8 x i8> 
%va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -484,8 +601,11 @@ define <8 x i1> @icmp_sge_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -496,8 +616,11 @@ define <8 x i1> @icmp_sge_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -508,8 +631,11 @@ define <8 x i1> @icmp_slt_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmslt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"slt", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -518,8 +644,11 @@ define <8 x i1> @icmp_slt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -530,8 +659,11 @@ define <8 x i1> @icmp_slt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> 
zeroinitializer @@ -542,8 +674,11 @@ define <8 x i1> @icmp_slt_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -554,8 +689,11 @@ define <8 x i1> @icmp_slt_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -566,8 +704,11 @@ define <8 x i1> @icmp_sle_vv_v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sle", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -576,8 +717,11 @@ define <8 x i1> @icmp_sle_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -589,9 +733,12 @@ ; CHECK-LABEL: icmp_sle_vx_swap_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -602,8 +749,11 @@ define <8 x i1> @icmp_sle_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 
4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -614,8 +764,11 @@ define <8 x i1> @icmp_sle_vi_swap_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 4, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -647,18 +800,20 @@ ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: bltu a3, a4, .LBB51_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: .LBB51_2: ; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: vlm.v v24, (a2) -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v25, (a2) +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: addi a0, a3, -128 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 @@ -670,8 +825,10 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: .LBB51_4: -; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -696,26 +853,31 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, mu -; CHECK-NEXT: vlm.v v25, (a1) +; CHECK-NEXT: vlm.v v2, (a1) ; CHECK-NEXT: addi a4, a2, -128 -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: bltu a2, a4, .LBB52_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a4 ; CHECK-NEXT: .LBB52_2: -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmseq.vx v16, v24, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB52_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: .LBB52_4: -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v25 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmseq.vx v17, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v8, v16 ; CHECK-NEXT: ret %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0 %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> 
zeroinitializer @@ -728,26 +890,31 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, mu -; CHECK-NEXT: vlm.v v25, (a1) +; CHECK-NEXT: vlm.v v2, (a1) ; CHECK-NEXT: addi a4, a2, -128 -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: bltu a2, a4, .LBB53_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a4 ; CHECK-NEXT: .LBB53_2: -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmseq.vx v16, v24, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB53_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: .LBB53_4: -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v25 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmseq.vx v17, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v8, v16 ; CHECK-NEXT: ret %elt.head = insertelement <256 x i8> poison, i8 %b, i32 0 %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer @@ -760,7 +927,9 @@ define <8 x i1> @icmp_eq_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmseq.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -771,7 +940,9 @@ define <8 x i1> @icmp_eq_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmseq.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -784,7 +955,9 @@ define <8 x i1> @icmp_eq_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmseq.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -797,7 +970,9 @@ define <8 x i1> @icmp_eq_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmseq.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -810,7 +985,9 @@ define <8 x i1> @icmp_eq_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: 
vmseq.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -823,7 +1000,9 @@ define <8 x i1> @icmp_ne_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsne.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -834,7 +1013,9 @@ define <8 x i1> @icmp_ne_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsne.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -847,7 +1028,9 @@ define <8 x i1> @icmp_ne_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsne.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -860,7 +1043,9 @@ define <8 x i1> @icmp_ne_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsne.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -873,7 +1058,9 @@ define <8 x i1> @icmp_ne_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsne.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -886,7 +1073,9 @@ define <8 x i1> @icmp_ugt_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsltu.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -897,7 +1086,9 @@ define <8 x i1> @icmp_ugt_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsgtu.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -910,7 +1101,9 @@ define <8 x i1> @icmp_ugt_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsltu.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -923,7 +1116,9 
@@ define <8 x i1> @icmp_ugt_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgtu.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -936,7 +1131,9 @@ define <8 x i1> @icmp_ugt_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -949,7 +1146,9 @@ define <8 x i1> @icmp_uge_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -962,7 +1161,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -975,7 +1176,9 @@ define <8 x i1> @icmp_uge_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -988,7 +1191,9 @@ define <8 x i1> @icmp_uge_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgtu.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1001,7 +1206,9 @@ define <8 x i1> @icmp_uge_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1014,7 +1221,9 @@ define <8 x i1> @icmp_ult_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsltu.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1025,7 +1234,9 @@ define <8 x i1> @icmp_ult_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_v8i32: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsltu.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1038,7 +1249,9 @@ define <8 x i1> @icmp_ult_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsgtu.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1051,7 +1264,9 @@ define <8 x i1> @icmp_ult_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsleu.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1064,7 +1279,9 @@ define <8 x i1> @icmp_ult_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgtu.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1077,7 +1294,9 @@ define <8 x i1> @icmp_sgt_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmslt.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1088,7 +1307,9 @@ define <8 x i1> @icmp_sgt_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1101,7 +1322,9 @@ define <8 x i1> @icmp_sgt_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmslt.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1114,7 +1337,9 @@ define <8 x i1> @icmp_sgt_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1127,7 +1352,9 @@ define <8 x i1> @icmp_sgt_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1140,7 +1367,9 @@ define <8 x i1> @icmp_sge_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1153,7 +1382,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1166,7 +1397,9 @@ define <8 x i1> @icmp_sge_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1179,7 +1412,9 @@ define <8 x i1> @icmp_sge_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1192,7 +1427,9 @@ define <8 x i1> @icmp_sge_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1205,7 +1442,9 @@ define <8 x i1> @icmp_slt_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmslt.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1216,7 +1455,9 @@ define <8 x i1> @icmp_slt_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmslt.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1229,7 +1470,9 @@ define <8 x i1> @icmp_slt_vx_swap_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vx v10, 
v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1242,7 +1485,9 @@ define <8 x i1> @icmp_slt_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1255,7 +1500,9 @@ define <8 x i1> @icmp_slt_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1268,7 +1515,9 @@ define <8 x i1> @icmp_sle_vv_v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1279,7 +1528,9 @@ define <8 x i1> @icmp_sle_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vx v10, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1294,7 +1545,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1307,7 +1560,9 @@ define <8 x i1> @icmp_sle_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsle.vi v10, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1320,7 +1575,9 @@ define <8 x i1> @icmp_sle_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu ; CHECK-NEXT: vmsgt.vi v10, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1338,41 +1595,57 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: li a1, 32 ; 
CHECK-NEXT: addi a4, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v24, (a4) +; CHECK-NEXT: vle32.v v8, (a4) +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; CHECK-NEXT: addi a4, a2, -32 -; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vslidedown.vi v0, v2, 4 ; CHECK-NEXT: bltu a2, a4, .LBB99_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a4 ; CHECK-NEXT: .LBB99_2: ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vmseq.vv v2, v16, v24, v0.t +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB99_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: .LBB99_4: -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vv v16, v24, v8, v0.t +; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v16, v2, 4 +; CHECK-NEXT: vslideup.vi v16, v1, 4 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1385,21 +1658,27 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; CHECK-NEXT: addi a3, a1, -32 -; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vslidedown.vi v0, v24, 4 ; CHECK-NEXT: bltu a1, a3, .LBB100_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB100_2: -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a1, a2, .LBB100_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB100_4: -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu @@ -1417,21 +1696,27 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; CHECK-NEXT: addi a3, a1, -32 -; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vslidedown.vi v0, v24, 4 ; CHECK-NEXT: bltu a1, a3, .LBB101_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB101_2: -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; 
CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a1, a2, .LBB101_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB101_4: -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu @@ -1449,7 +1734,9 @@ define <8 x i1> @icmp_eq_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmseq.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1467,7 +1754,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmseq.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1475,7 +1764,9 @@ ; ; RV64-LABEL: icmp_eq_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmseq.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1495,7 +1786,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmseq.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1503,7 +1796,9 @@ ; ; RV64-LABEL: icmp_eq_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmseq.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1516,7 +1811,9 @@ define <8 x i1> @icmp_eq_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmseq.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1529,7 +1826,9 @@ define <8 x i1> @icmp_eq_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmseq.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1542,7 +1841,9 @@ define <8 x i1> @icmp_ne_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsne.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1560,7 +1861,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsne.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1568,7 +1871,9 @@ ; ; RV64-LABEL: icmp_ne_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsne.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1588,7 +1893,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsne.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1596,7 +1903,9 @@ ; ; RV64-LABEL: icmp_ne_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsne.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1609,7 +1918,9 @@ define <8 x i1> @icmp_ne_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsne.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1622,7 +1933,9 @@ define <8 x i1> @icmp_ne_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsne.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1635,7 +1948,9 @@ define <8 x i1> @icmp_ugt_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsltu.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1653,7 +1968,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsltu.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1661,7 +1978,9 @@ ; ; RV64-LABEL: icmp_ugt_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma 
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsgtu.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1681,7 +2000,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsltu.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1689,7 +2010,9 @@ ; ; RV64-LABEL: icmp_ugt_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsltu.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1702,7 +2025,9 @@ define <8 x i1> @icmp_ugt_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1715,7 +2040,9 @@ define <8 x i1> @icmp_ugt_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1728,7 +2055,9 @@ define <8 x i1> @icmp_uge_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsleu.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1746,7 +2075,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsleu.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1756,7 +2087,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vmv.v.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsleu.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1776,7 +2109,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsleu.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1784,7 +2119,9 @@ ; ; RV64-LABEL: icmp_uge_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, 
a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsleu.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1797,7 +2134,9 @@ define <8 x i1> @icmp_uge_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1810,7 +2149,9 @@ define <8 x i1> @icmp_uge_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1823,7 +2164,9 @@ define <8 x i1> @icmp_ult_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsltu.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1841,7 +2184,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsltu.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1849,7 +2194,9 @@ ; ; RV64-LABEL: icmp_ult_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsltu.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1869,7 +2216,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsltu.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1877,7 +2226,9 @@ ; ; RV64-LABEL: icmp_ult_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsgtu.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1890,7 +2241,9 @@ define <8 x i1> @icmp_ult_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1903,7 +2256,9 @@ define <8 x i1> @icmp_ult_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; 
CHECK-LABEL: icmp_ult_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1916,7 +2271,9 @@ define <8 x i1> @icmp_sgt_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmslt.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1934,7 +2291,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmslt.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1942,7 +2301,9 @@ ; ; RV64-LABEL: icmp_sgt_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsgt.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1962,7 +2323,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmslt.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -1970,7 +2333,9 @@ ; ; RV64-LABEL: icmp_sgt_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmslt.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -1983,7 +2348,9 @@ define <8 x i1> @icmp_sgt_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1996,7 +2363,9 @@ define <8 x i1> @icmp_sgt_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2009,7 +2378,9 @@ define <8 x i1> @icmp_sge_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2027,7 
+2398,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsle.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2037,7 +2410,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vmv.v.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsle.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -2057,7 +2432,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsle.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2065,7 +2442,9 @@ ; ; RV64-LABEL: icmp_sge_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsle.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -2078,7 +2457,9 @@ define <8 x i1> @icmp_sge_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2091,7 +2472,9 @@ define <8 x i1> @icmp_sge_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2104,7 +2487,9 @@ define <8 x i1> @icmp_slt_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmslt.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2122,7 +2507,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmslt.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2130,7 +2517,9 @@ ; ; RV64-LABEL: icmp_slt_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmslt.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ 
-2150,7 +2539,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmslt.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2158,7 +2549,9 @@ ; ; RV64-LABEL: icmp_slt_vx_swap_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsgt.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -2171,7 +2564,9 @@ define <8 x i1> @icmp_slt_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2184,7 +2579,9 @@ define <8 x i1> @icmp_slt_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2197,7 +2594,9 @@ define <8 x i1> @icmp_sle_vv_v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2215,7 +2614,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsle.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2223,7 +2624,9 @@ ; ; RV64-LABEL: icmp_sle_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsle.vx v12, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: ret @@ -2243,7 +2646,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vmsle.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: addi sp, sp, 16 @@ -2253,7 +2658,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vmv.v.x v16, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu ; RV64-NEXT: vmsle.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: 
ret @@ -2266,7 +2673,9 @@ define <8 x i1> @icmp_sle_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2279,7 +2688,9 @@ define <8 x i1> @icmp_sle_vi_swap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll @@ -155,13 +155,17 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v1, 2 ; CHECK-NEXT: bltu a0, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu @@ -189,6 +193,8 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll @@ -310,9 +310,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll @@ -310,9 +310,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -71,15 +71,19 @@ ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: beqz a0, .LBB4_2 ; RV32-NEXT: .LBB4_4: # %cond.load1 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: lb a1, 1(a0) ; RV32-NEXT: lbu a0, 0(a0) ; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: vmv.s.x v8, a0 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; RV32-NEXT: vmv.s.x v8, a0 ; RV32-NEXT: vslideup.vi v9, v8, 1 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret @@ -108,15 +112,19 @@ ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: beqz a0, .LBB4_2 ; RV64-NEXT: .LBB4_4: # %cond.load1 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: lb a1, 1(a0) ; RV64-NEXT: lbu a0, 0(a0) ; RV64-NEXT: slli a1, a1, 8 ; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu +; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: vslideup.vi v9, v8, 1 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret @@ -152,9 +160,11 @@ ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: beqz a0, .LBB5_2 ; RV32-NEXT: .LBB5_4: # %cond.load1 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v11, v8, 1 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, mu @@ -189,15 +199,19 @@ ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: beqz a0, .LBB5_2 ; RV64-NEXT: .LBB5_4: # %cond.load1 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: lwu a1, 4(a0) ; RV64-NEXT: lwu a0, 0(a0) ; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: vslideup.vi v9, v8, 1 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret @@ -236,10 +250,14 @@ ; RV32-NEXT: andi a1, a0, 2 ; RV32-NEXT: beqz a1, .LBB6_2 ; RV32-NEXT: .LBB6_6: # %cond.store1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV32-NEXT: vslidedown.vi v10, v8, 1 ; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32-NEXT: vslidedown.vi v10, v9, 1 ; RV32-NEXT: vmv.x.s a2, v10 ; RV32-NEXT: sb a1, 0(a2) @@ -248,10 +266,14 @@ ; RV32-NEXT: andi a1, a0, 4 ; RV32-NEXT: beqz a1, .LBB6_3 ; RV32-NEXT: .LBB6_7: # %cond.store3 +; RV32-NEXT: vsetvli a1, 
zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32-NEXT: vslidedown.vi v10, v9, 2 ; RV32-NEXT: vmv.x.s a2, v10 ; RV32-NEXT: sb a1, 0(a2) @@ -260,10 +282,14 @@ ; RV32-NEXT: andi a0, a0, 8 ; RV32-NEXT: beqz a0, .LBB6_4 ; RV32-NEXT: .LBB6_8: # %cond.store5 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32-NEXT: vslidedown.vi v8, v9, 3 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sb a0, 0(a1) @@ -299,10 +325,14 @@ ; RV64-NEXT: andi a1, a0, 2 ; RV64-NEXT: beqz a1, .LBB6_2 ; RV64-NEXT: .LBB6_6: # %cond.store1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64-NEXT: vslidedown.vi v9, v8, 1 ; RV64-NEXT: vmv.x.s a1, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v10, 1 ; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: sb a1, 0(a2) @@ -311,10 +341,14 @@ ; RV64-NEXT: andi a1, a0, 4 ; RV64-NEXT: beqz a1, .LBB6_3 ; RV64-NEXT: .LBB6_7: # %cond.store3 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64-NEXT: vslidedown.vi v9, v8, 2 ; RV64-NEXT: vmv.x.s a1, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v10, 2 ; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: sb a1, 0(a2) @@ -323,10 +357,14 @@ ; RV64-NEXT: andi a0, a0, 8 ; RV64-NEXT: beqz a0, .LBB6_4 ; RV64-NEXT: .LBB6_8: # %cond.store5 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, mu ; RV64-NEXT: vslidedown.vi v8, v10, 3 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sb a0, 0(a1) @@ -361,9 +399,14 @@ ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: beqz a0, .LBB7_2 ; RV32-NEXT: .LBB7_4: # %cond.store1 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vslidedown.vi v8, v9, 1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sh a0, 0(a1) @@ -393,10 +436,14 @@ ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: beqz a0, .LBB7_2 ; RV64-NEXT: 
.LBB7_4: # %cond.store1 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vslidedown.vi v8, v9, 1 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sh a0, 0(a1) @@ -450,8 +497,10 @@ ; RV32-NEXT: or a0, a3, a0 ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, mu ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: .LBB8_4: # %else2 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -498,8 +547,10 @@ ; RV64-NEXT: or a0, a3, a0 ; RV64-NEXT: slli a0, a0, 16 ; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, tu, mu ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: .LBB8_4: # %else2 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -536,9 +587,11 @@ ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: beqz a1, .LBB9_2 ; CHECK-NEXT: .LBB9_4: # %cond.store1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vmv.x.s a1, v9 ; CHECK-NEXT: sh a1, 4(a0) ; CHECK-NEXT: srli a1, a1, 16 ; CHECK-NEXT: sh a1, 6(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -494,6 +494,8 @@ ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: ret %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer @@ -1534,8 +1536,10 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a1, 0 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v0, v1, 2 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: addi a2, a0, -16 @@ -1560,9 +1564,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: li a1, 0 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: addi a2, a0, -16 -; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: bltu a0, a2, .LBB108_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a1, a2 @@ -1643,12 +1649,16 @@ ; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, mu ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vx_v32i64_evl12: ; RV64: # 
%bb.0: ; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, mu ; RV64-NEXT: vadd.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: ret %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer @@ -1659,20 +1669,48 @@ define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { ; RV32-LABEL: vadd_vx_v32i64_evl27: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vmv.v.i v8, -1 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v16, v16, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vx_v32i64_evl27: ; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v24, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll @@ -1299,7 +1299,9 @@ ; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: lui a1, 341 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vmerge.vxm v24, v24, a0, v0 @@ -1327,7 +1329,9 @@ ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: lui a1, 341 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, tu, mu ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vmerge.vxm v16, v16, a0, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -661,8 +661,10 @@ ; 
CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v1, 2 ; CHECK-NEXT: addi a1, a2, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v24, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -323,9 +323,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwadd_v2f16(<2 x half> *%x, <2 x half> *%y) { @@ -91,25 +91,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, 
a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwadd.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 +; CHECK-NEXT: vfwadd.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -191,24 +228,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwadd.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 +; CHECK-NEXT: vfwadd.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -371,6 +445,8 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, 
e32, m8, ta, mu ; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwmul_v2f16(<2 x half> *%x, <2 x half> *%y) { @@ -91,25 +91,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwmul.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 +; CHECK-NEXT: vfwmul.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; 
CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -191,24 +228,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwmul.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 +; CHECK-NEXT: vfwmul.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -371,6 +445,8 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m 
-target-abi=ilp32d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+f,+d,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwsub_v2f16(<2 x half> *%x, <2 x half> *%y) { @@ -91,25 +91,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 +; CHECK-NEXT: vfwsub.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -191,24 +228,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; 
CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vfwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 +; CHECK-NEXT: vfwsub.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -371,6 +445,8 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -292,15 +292,22 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB13_2: +; RV64-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 ; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v9, (a0), v16, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: li a2, 16 +; RV64-NEXT: vmv1r.v v12, v9 ; RV64-NEXT: bltu a1, a2, .LBB13_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a1, 16 @@ -309,7 +316,10 @@ ; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu ; RV64-NEXT: vmv1r.v v0, v10 -; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t +; 
RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmv1r.v v8, v10 ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, mu ; RV64-NEXT: vslideup.vi v8, v12, 16 @@ -1931,8 +1941,12 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB86_2: +; RV32-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v1, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu @@ -1957,6 +1971,8 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB86_2: +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu @@ -1983,24 +1999,42 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB87_2: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v8, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, -16 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: bltu a1, a3, .LBB87_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB87_4: +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v16, v0, 2 +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v32i8_v32f64: @@ -2012,8 +2046,12 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB87_2: +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 +; RV64-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2043,6 +2081,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2056,6 +2096,8 @@ ; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v12, v16 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: 
vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v10, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2078,6 +2120,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v10, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2088,6 +2132,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB88_2: ; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2116,6 +2162,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV32-NEXT: vslidedown.vi v12, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2129,6 +2177,8 @@ ; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v12, v16 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v10, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2151,6 +2201,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v10, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2161,6 +2213,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB89_2: ; RV64-NEXT: vzext.vf8 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v10, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2193,24 +2247,42 @@ ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB90_2: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsll.vi v8, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, -16 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: bltu a1, a3, .LBB90_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB90_4: +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v16, v0, 2 +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: 
vpgather_baseidx_v32i16_v32f64: @@ -2222,8 +2294,12 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB90_2: +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2253,6 +2329,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu ; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2266,6 +2344,8 @@ ; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v8, v16 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v12, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2288,6 +2368,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2298,6 +2380,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB91_2: ; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2326,6 +2410,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v12, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu ; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2339,6 +2425,8 @@ ; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v8, v16 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v12, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2361,6 +2449,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2371,6 +2461,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB92_2: ; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v12, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2414,11 +2506,16 @@ ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a2, a3 ; RV32-NEXT: .LBB93_4: +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v16, v0, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: 
vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; @@ -2431,8 +2528,12 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB93_2: +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v1, 2 +; RV64-NEXT: vsetvli a3, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2462,6 +2563,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2475,6 +2578,8 @@ ; RV32-NEXT: vsll.vi v8, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v4, v8 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v1, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2497,6 +2602,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v1, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2507,6 +2614,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB94_2: ; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v1, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2535,6 +2644,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetvli a3, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2548,6 +2659,8 @@ ; RV32-NEXT: vsll.vi v8, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v4, v8 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v1, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2570,6 +2683,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v1, v0 ; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetvli a3, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2580,6 +2695,8 @@ ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB95_2: ; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v1, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu @@ -2617,6 +2734,8 @@ ; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v28, v16 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v24, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu @@ -2644,6 +2763,8 @@ ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 ; RV64-NEXT: .LBB96_2: +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, 
e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v24, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -388,6 +388,8 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: addi a3, a0, 128 @@ -426,6 +428,8 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB32_4: +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: addi a5, a1, 128 @@ -442,6 +446,8 @@ ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB32_8: +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vi v0, v8, 4 ; CHECK-NEXT: addi a5, a1, 256 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -1088,6 +1088,8 @@ ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB79_2: ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32-NEXT: vslidedown.vi v0, v1, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, tu, mu @@ -1144,6 +1146,8 @@ ; RV64-NEXT: mv a1, a3 ; RV64-NEXT: .LBB79_2: ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV64-NEXT: vslidedown.vi v0, v1, 2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, tu, mu @@ -1183,6 +1187,8 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB80_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1735,16 +1735,22 @@ ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: addi a2, a1, -16 ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t +; RV32-NEXT: vmv8r.v v8, v24 ; RV32-NEXT: bltu a1, a2, .LBB79_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: .LBB79_4: +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v8, v0, 2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu -; RV32-NEXT: vsoxei32.v v16, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsoxei32.v v16, (zero), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_v32f64: @@ -1763,30 +1769,33 @@ ; RV64-NEXT: vs8r.v v24, 
(a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a3, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: mv a1, a2 -; RV64-NEXT: bltu a2, a3, .LBB79_2 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB79_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: .LBB79_2: -; RV64-NEXT: li a3, 0 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: vle64.v v16, (a0) -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: addi a0, a2, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: bltu a2, a0, .LBB79_4 ; RV64-NEXT: # %bb.3: -; RV64-NEXT: mv a3, a0 +; RV64-NEXT: mv a1, a0 ; RV64-NEXT: .LBB79_4: +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vslidedown.vi v8, v0, 2 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t @@ -1802,30 +1811,48 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v32i32_v32f64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: mv a1, a2 -; RV32-NEXT: bltu a2, a3, .LBB80_2 -; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB80_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a3, 16 ; RV32-NEXT: .LBB80_2: -; RV32-NEXT: li a3, 0 -; RV32-NEXT: vsll.vi v24, v24, 3 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV32-NEXT: addi a1, a2, -16 -; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t -; RV32-NEXT: bltu a2, a1, .LBB80_4 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: addi a3, a2, -16 +; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: bltu a2, a3, .LBB80_4 ; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB80_4: +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu -; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: addi a1, 
sp, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_v32i32_v32f64: @@ -1868,19 +1895,24 @@ ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a1, a3 ; RV64-NEXT: .LBB80_4: +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vslidedown.vi v8, v0, 2 +; RV64-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: vl8re8.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vslidedown.vi v24, v0, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf2 v24, v8 -; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsext.vf2 v0, v24 +; RV64-NEXT: vsll.vi v24, v0, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: li a1, 10 ; RV64-NEXT: mul a0, a0, a1 @@ -1898,57 +1930,76 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 18 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a4, a3, 3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v0, (a1) +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vslidedown.vi v24, v0, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: li a3, 16 -; RV32-NEXT: vsext.vf2 v16, v24 -; RV32-NEXT: mv a1, a2 -; RV32-NEXT: bltu a2, a3, .LBB81_2 -; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 +; RV32-NEXT: vsext.vf2 v8, v0 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB81_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a3, 16 ; RV32-NEXT: .LBB81_2: -; RV32-NEXT: li a3, 0 -; RV32-NEXT: vsext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: vsext.vf2 v16, v24 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu +; RV32-NEXT: vncvt.x.x.w v24, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: addi a3, a2, -16 +; 
RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: bltu a2, a1, .LBB81_4 +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: bltu a2, a3, .LBB81_4 ; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB81_4: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 -; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 18 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1958,64 +2009,57 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 24 +; RV64-NEXT: li a4, 10 ; RV64-NEXT: mul a3, a3, a4 ; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v8, v24, 16 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsext.vf2 v8, v24 -; RV64-NEXT: mv a1, a2 -; RV64-NEXT: bltu a2, a3, .LBB81_2 -; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 +; RV64-NEXT: vsext.vf2 v0, v16 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB81_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a3, 16 ; RV64-NEXT: .LBB81_2: -; RV64-NEXT: li a3, 0 -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: li a1, 0 ; RV64-NEXT: vsext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: addi a1, a2, -16 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 3 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; 
RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t -; RV64-NEXT: bltu a2, a1, .LBB81_4 +; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a3, a2, -16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: bltu a2, a3, .LBB81_4 ; RV64-NEXT: # %bb.3: -; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a1, a3 ; RV64-NEXT: .LBB81_4: +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vslidedown.vi v8, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 +; RV64-NEXT: li a1, 10 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 @@ -2032,57 +2076,76 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 18 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a4, a3, 3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v0, (a1) +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vslidedown.vi v24, v0, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: li a3, 16 -; RV32-NEXT: vzext.vf2 v16, v24 -; RV32-NEXT: mv a1, a2 -; RV32-NEXT: bltu a2, a3, .LBB82_2 -; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 +; RV32-NEXT: vzext.vf2 v8, v0 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB82_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a3, 16 ; RV32-NEXT: .LBB82_2: -; RV32-NEXT: li a3, 0 -; RV32-NEXT: vzext.vf2 v24, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: vzext.vf2 v16, v24 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu +; RV32-NEXT: vncvt.x.x.w v24, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: addi a3, 
a2, -16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: bltu a2, a1, .LBB82_4 +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: bltu a2, a3, .LBB82_4 ; RV32-NEXT: # %bb.3: -; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB82_4: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v24, 3 -; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v0, 2 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 18 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -2092,64 +2155,57 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 24 +; RV64-NEXT: li a4, 10 ; RV64-NEXT: mul a3, a3, a4 ; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu -; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v8, v24, 16 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: li a3, 16 -; RV64-NEXT: vzext.vf2 v8, v24 -; RV64-NEXT: mv a1, a2 -; RV64-NEXT: bltu a2, a3, .LBB82_2 -; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 +; RV64-NEXT: vzext.vf2 v0, v16 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB82_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a3, 16 ; RV64-NEXT: .LBB82_2: -; RV64-NEXT: li a3, 0 -; RV64-NEXT: addi a4, sp, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: li a1, 0 ; RV64-NEXT: vzext.vf2 v16, v24 -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu -; RV64-NEXT: addi a1, a2, -16 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 3 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, 
a4, 16 -; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t -; RV64-NEXT: bltu a2, a1, .LBB82_4 +; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a3, a2, -16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: bltu a2, a3, .LBB82_4 ; RV64-NEXT: # %bb.3: -; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a1, a3 ; RV64-NEXT: .LBB82_4: +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vslidedown.vi v8, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 24 +; RV64-NEXT: li a1, 10 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -299,10 +299,13 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB23_4: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v8, v0, 2 ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vse64.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.vp.store.v32f64.p0v32f64(<32 x double> %val, <32 x double>* %ptr, <32 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -449,6 +449,8 @@ ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vi v0, v24, 2 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu @@ -486,15 +488,31 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) { ; CHECK-LABEL: select_evl_v32i64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetvli 
a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 17) ret <32 x i64> %v @@ -604,46 +622,39 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: bltu a2, a3, .LBB35_2 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bltu a2, a1, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: .LBB35_2: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu ; CHECK-NEXT: addi a0, a2, -32 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: bltu a2, a0, .LBB35_4 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv a3, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: .LBB35_4: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m 
-riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwadd_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwadd_v2i16: @@ -250,25 +250,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vwadd.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +322,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb 
+; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vwadd.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,24 +394,61 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v0, v16, v24 +; 
CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vwadd.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwaddu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwaddu_v2i16: @@ -250,25 +250,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; 
CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vwaddu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +322,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vwaddu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,24 +394,61 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: 
vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vwaddu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwmul_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwmul_v2i16: @@ -251,25 +251,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 
+; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vwmul.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -287,25 +324,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb 
+; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vwmul.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -323,24 +397,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vwmul.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwmulsu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwmulsu_v2i16: @@ -267,25 +267,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v0, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vwmulsu.vv v16, v8, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -303,25 +340,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: 
vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v0, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vwmulsu.vv v16, v8, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -339,24 +413,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # 
Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v0, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vwmulsu.vv v16, v8, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s define <2 x i16> @vwmulu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwmulu_v2i16: @@ -251,25 +251,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; 
CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vwmulu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -287,25 +324,62 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vwmulu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ 
-323,24 +397,61 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vwmulu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwsub_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwsub_v2i16: @@ -250,25 +250,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, 
vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vwsub.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +322,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; 
CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vwsub.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,24 +394,61 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vwsub.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add 
a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <2 x i16> @vwsubu_v2i16(<2 x i8>* %x, <2 x i8>* %y) { ; CHECK-LABEL: vwsubu_v2i16: @@ -250,25 +250,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vwsubu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size 
Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +322,62 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vwsubu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,24 +394,61 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: 
vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v0, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vwsubu.vv v16, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll @@ -155,13 +155,17 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v0, v1, 2 ; CHECK-NEXT: bltu a0, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu @@ -189,6 +193,8 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu ; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll --- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll @@ -4,11 +4,15 @@ define i64 @reduce_add(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_add: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu 
+; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) @@ -21,9 +25,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) @@ -34,11 +40,15 @@ define i64 @reduce_and(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_and: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) @@ -51,9 +61,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) @@ -64,11 +76,15 @@ define i64 @reduce_or(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_or: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) @@ -81,9 +97,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) @@ -94,11 +112,15 @@ define i64 @reduce_xor(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_xor: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; 
CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) @@ -109,11 +131,15 @@ define i64 @reduce_xor2(<4 x i64> %v) { ; CHECK-LABEL: reduce_xor2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: andi a0, a0, 8 ; CHECK-NEXT: ret entry: @@ -125,11 +151,15 @@ define i64 @reduce_umax(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_umax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) @@ -142,9 +172,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) @@ -155,11 +187,15 @@ define i64 @reduce_umin(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_umin: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) @@ -172,9 +208,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 
@llvm.vector.reduce.umin.v4i64(<4 x i64> %v) @@ -185,11 +223,15 @@ define i64 @reduce_smax(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_smax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) @@ -202,9 +244,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) @@ -215,11 +259,15 @@ define i64 @reduce_smin(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_smin: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmin.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) @@ -232,9 +280,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, 8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vredmin.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret entry: %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) @@ -245,11 +295,15 @@ define float @reduce_fadd(float %x, <4 x float> %v) { ; CHECK-LABEL: reduce_fadd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret entry: %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v) @@ -259,11 +313,15 @@ define float @reduce_fmax(float %x, <4 x float> %v) { ; CHECK-LABEL: reduce_fmax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, 
mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret entry: %rdx = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) @@ -274,11 +332,15 @@ define float @reduce_fmin(float %x, <4 x float> %v) { ; CHECK-LABEL: reduce_fmin: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret entry: %rdx = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -367,6 +367,10 @@ define @insert_nxv32f16_undef_nxv1f16_0( %subvec) { ; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_0: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv1f16.nxv32f16( undef, %subvec, i64 0) ret %v @@ -379,8 +383,13 @@ ; CHECK-NEXT: srli a1, a0, 3 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vx v14, v8, a0 +; CHECK-NEXT: vslideup.vx v16, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv1r.v v14, v16 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv1f16.nxv32f16( undef, %subvec, i64 26) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv32.ll @@ -15,7 +15,9 @@ define @insertelt_nxv1f16_imm( %v, half %elt) { ; CHECK-LABEL: insertelt_nxv1f16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -27,7 +29,9 @@ define @insertelt_nxv1f16_idx( %v, half %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv1f16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu @@ -50,7 +54,9 @@ define @insertelt_nxv2f16_imm( %v, half %elt) { ; CHECK-LABEL: insertelt_nxv2f16_imm: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -62,7 +68,9 @@ define @insertelt_nxv2f16_idx( %v, half %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv2f16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu @@ -86,6 +94,8 @@ ; CHECK-LABEL: insertelt_nxv4f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -98,6 +108,8 @@ ; CHECK-LABEL: insertelt_nxv4f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu @@ -121,6 +133,8 @@ ; CHECK-LABEL: insertelt_nxv8f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -133,6 +147,8 @@ ; CHECK-LABEL: insertelt_nxv8f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, mu @@ -156,6 +172,8 @@ ; CHECK-LABEL: insertelt_nxv16f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -168,6 +186,8 @@ ; CHECK-LABEL: insertelt_nxv16f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu @@ -191,6 +211,8 @@ ; CHECK-LABEL: insertelt_nxv32f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -203,6 +225,8 @@ ; CHECK-LABEL: insertelt_nxv32f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu @@ -225,7 +249,9 @@ define @insertelt_nxv1f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv1f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, 
mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -237,7 +263,9 @@ define @insertelt_nxv1f32_idx( %v, float %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv1f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu @@ -260,7 +288,9 @@ define @insertelt_nxv2f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv2f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -272,7 +302,9 @@ define @insertelt_nxv2f32_idx( %v, float %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv2f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu @@ -295,7 +327,9 @@ define @insertelt_nxv4f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv4f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -307,7 +341,9 @@ define @insertelt_nxv4f32_idx( %v, float %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv4f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu @@ -330,7 +366,9 @@ define @insertelt_nxv8f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv8f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -342,7 +380,9 @@ define @insertelt_nxv8f32_idx( %v, float %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv8f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu @@ -365,7 +405,9 @@ define @insertelt_nxv16f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv16f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -377,7 +419,9 @@ define @insertelt_nxv16f32_idx( %v, float %elt, 
i32 %idx) { ; CHECK-LABEL: insertelt_nxv16f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu @@ -400,7 +444,9 @@ define @insertelt_nxv1f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv1f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -412,7 +458,9 @@ define @insertelt_nxv1f64_idx( %v, double %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv1f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, mu @@ -435,7 +483,9 @@ define @insertelt_nxv2f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv2f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -447,7 +497,9 @@ define @insertelt_nxv2f64_idx( %v, double %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv2f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m2, tu, mu @@ -470,7 +522,9 @@ define @insertelt_nxv4f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv4f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -482,7 +536,9 @@ define @insertelt_nxv4f64_idx( %v, double %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv4f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m4, tu, mu @@ -505,7 +561,9 @@ define @insertelt_nxv8f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv8f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -517,7 +575,9 @@ define @insertelt_nxv8f64_idx( %v, double %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv8f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i 
v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp-rv64.ll @@ -15,7 +15,9 @@ define @insertelt_nxv1f16_imm( %v, half %elt) { ; CHECK-LABEL: insertelt_nxv1f16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -27,7 +29,9 @@ define @insertelt_nxv1f16_idx( %v, half %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1f16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu @@ -50,7 +54,9 @@ define @insertelt_nxv2f16_imm( %v, half %elt) { ; CHECK-LABEL: insertelt_nxv2f16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -62,7 +68,9 @@ define @insertelt_nxv2f16_idx( %v, half %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2f16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu @@ -86,6 +94,8 @@ ; CHECK-LABEL: insertelt_nxv4f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -98,6 +108,8 @@ ; CHECK-LABEL: insertelt_nxv4f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu @@ -121,6 +133,8 @@ ; CHECK-LABEL: insertelt_nxv8f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -133,6 +147,8 @@ ; CHECK-LABEL: insertelt_nxv8f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, mu @@ -156,6 +172,8 @@ ; CHECK-LABEL: insertelt_nxv16f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; 
CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -168,6 +186,8 @@ ; CHECK-LABEL: insertelt_nxv16f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu @@ -191,6 +211,8 @@ ; CHECK-LABEL: insertelt_nxv32f16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -203,6 +225,8 @@ ; CHECK-LABEL: insertelt_nxv32f16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu @@ -225,7 +249,9 @@ define @insertelt_nxv1f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv1f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -237,7 +263,9 @@ define @insertelt_nxv1f32_idx( %v, float %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu @@ -260,7 +288,9 @@ define @insertelt_nxv2f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv2f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -272,7 +302,9 @@ define @insertelt_nxv2f32_idx( %v, float %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu @@ -295,7 +327,9 @@ define @insertelt_nxv4f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv4f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -307,7 +341,9 @@ define @insertelt_nxv4f32_idx( %v, float %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv4f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, 
a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu @@ -330,7 +366,9 @@ define @insertelt_nxv8f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv8f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -342,7 +380,9 @@ define @insertelt_nxv8f32_idx( %v, float %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv8f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu @@ -365,7 +405,9 @@ define @insertelt_nxv16f32_imm( %v, float %elt) { ; CHECK-LABEL: insertelt_nxv16f32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -377,7 +419,9 @@ define @insertelt_nxv16f32_idx( %v, float %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv16f32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu @@ -400,7 +444,9 @@ define @insertelt_nxv1f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv1f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -412,7 +458,9 @@ define @insertelt_nxv1f64_idx( %v, double %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, mu @@ -435,7 +483,9 @@ define @insertelt_nxv2f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv2f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -447,7 +497,9 @@ define @insertelt_nxv2f64_idx( %v, double %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m2, tu, mu @@ -470,7 +522,9 @@ define @insertelt_nxv4f64_imm( %v, double %elt) { ; CHECK-LABEL: 
insertelt_nxv4f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -482,7 +536,9 @@ define @insertelt_nxv4f64_idx( %v, double %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv4f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m4, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m4, tu, mu @@ -505,7 +561,9 @@ define @insertelt_nxv8f64_imm( %v, double %elt) { ; CHECK-LABEL: insertelt_nxv8f64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -517,7 +575,9 @@ define @insertelt_nxv8f64_idx( %v, double %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv8f64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m8, tu, mu ; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll @@ -5,8 +5,11 @@ define @insertelt_nxv1i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv1i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, mf8, tu, mu @@ -22,8 +25,11 @@ define @insertelt_idx_nxv1i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv1i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -40,8 +46,11 @@ define @insertelt_nxv2i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv2i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, mu @@ -57,8 +66,11 @@ define @insertelt_idx_nxv2i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv2i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, 
mf4, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -75,8 +87,11 @@ define @insertelt_nxv4i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv4i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, mu @@ -92,8 +107,11 @@ define @insertelt_idx_nxv4i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv4i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -110,8 +128,11 @@ define @insertelt_nxv8i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv8i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, m1, tu, mu @@ -127,8 +148,11 @@ define @insertelt_idx_nxv8i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv8i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -145,8 +169,11 @@ define @insertelt_nxv16i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv16i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, m2, tu, mu @@ -162,8 +189,11 @@ define @insertelt_idx_nxv16i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv16i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -180,8 +210,11 @@ define @insertelt_nxv32i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv32i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vsetivli zero, 
3, e8, m4, tu, mu @@ -197,8 +230,11 @@ define @insertelt_idx_nxv32i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv32i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 @@ -215,8 +251,11 @@ define @insertelt_nxv64i1( %x, i1 %elt) { ; CHECK-LABEL: insertelt_nxv64i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m8, ta, mu ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vsetivli zero, 3, e8, m8, tu, mu @@ -232,8 +271,11 @@ define @insertelt_idx_nxv64i1( %x, i1 %elt, i64 %idx) { ; CHECK-LABEL: insertelt_idx_nxv64i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m8, ta, mu ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll @@ -15,7 +15,9 @@ define @insertelt_nxv1i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv1i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -27,7 +29,9 @@ define @insertelt_nxv1i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu @@ -50,7 +54,9 @@ define @insertelt_nxv2i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv2i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -62,7 +68,9 @@ define @insertelt_nxv2i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu @@ -85,7 +93,9 @@ define @insertelt_nxv4i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv4i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, 
mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -97,7 +107,9 @@ define @insertelt_nxv4i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv4i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu @@ -120,7 +132,9 @@ define @insertelt_nxv8i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv8i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -132,7 +146,9 @@ define @insertelt_nxv8i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv8i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu @@ -155,7 +171,9 @@ define @insertelt_nxv16i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv16i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -167,7 +185,9 @@ define @insertelt_nxv16i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv16i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu @@ -190,7 +210,9 @@ define @insertelt_nxv32i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv32i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -202,7 +224,9 @@ define @insertelt_nxv32i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv32i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu @@ -225,7 +249,9 @@ define @insertelt_nxv64i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv64i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; 
CHECK-NEXT: vsetivli zero, 4, e8, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -237,7 +263,9 @@ define @insertelt_nxv64i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv64i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu @@ -260,7 +288,9 @@ define @insertelt_nxv1i16_imm( %v, i16 signext %elt) { ; CHECK-LABEL: insertelt_nxv1i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -272,7 +302,9 @@ define @insertelt_nxv1i16_idx( %v, i16 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu @@ -295,7 +327,9 @@ define @insertelt_nxv2i16_imm( %v, i16 signext %elt) { ; CHECK-LABEL: insertelt_nxv2i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -307,7 +341,9 @@ define @insertelt_nxv2i16_idx( %v, i16 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu @@ -331,6 +367,8 @@ ; CHECK-LABEL: insertelt_nxv4i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -343,6 +381,8 @@ ; CHECK-LABEL: insertelt_nxv4i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu @@ -366,6 +406,8 @@ ; CHECK-LABEL: insertelt_nxv8i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -378,6 +420,8 @@ ; CHECK-LABEL: insertelt_nxv8i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu @@ -401,6 +445,8 @@ ; CHECK-LABEL: insertelt_nxv16i16_imm: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -413,6 +459,8 @@ ; CHECK-LABEL: insertelt_nxv16i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu @@ -436,6 +484,8 @@ ; CHECK-LABEL: insertelt_nxv32i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -448,6 +498,8 @@ ; CHECK-LABEL: insertelt_nxv32i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu @@ -470,7 +522,9 @@ define @insertelt_nxv1i32_imm( %v, i32 %elt) { ; CHECK-LABEL: insertelt_nxv1i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -482,7 +536,9 @@ define @insertelt_nxv1i32_idx( %v, i32 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv1i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu @@ -505,7 +561,9 @@ define @insertelt_nxv2i32_imm( %v, i32 %elt) { ; CHECK-LABEL: insertelt_nxv2i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -517,7 +575,9 @@ define @insertelt_nxv2i32_idx( %v, i32 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv2i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu @@ -540,7 +600,9 @@ define @insertelt_nxv4i32_imm( %v, i32 %elt) { ; CHECK-LABEL: insertelt_nxv4i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -552,7 +614,9 @@ define @insertelt_nxv4i32_idx( %v, i32 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv4i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: 
vsetvli a2, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu @@ -575,7 +639,9 @@ define @insertelt_nxv8i32_imm( %v, i32 %elt) { ; CHECK-LABEL: insertelt_nxv8i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -587,7 +653,9 @@ define @insertelt_nxv8i32_idx( %v, i32 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv8i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu @@ -610,7 +678,9 @@ define @insertelt_nxv16i32_imm( %v, i32 %elt) { ; CHECK-LABEL: insertelt_nxv16i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -622,7 +692,9 @@ define @insertelt_nxv16i32_idx( %v, i32 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv16i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu @@ -820,7 +892,9 @@ ; CHECK-LABEL: insertelt_nxv2i64_imm_c10: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 10 -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -833,7 +907,9 @@ ; CHECK-LABEL: insertelt_nxv2i64_idx_c10: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 10 -; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m2, tu, mu @@ -858,7 +934,9 @@ ; CHECK-LABEL: insertelt_nxv2i64_imm_cn1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -871,7 +949,9 @@ ; CHECK-LABEL: insertelt_nxv2i64_idx_cn1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: addi a1, a0, 1 ; CHECK-NEXT: vsetvli zero, a1, e64, m2, tu, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll @@ -15,7 +15,9 @@ define @insertelt_nxv1i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv1i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -27,7 +29,9 @@ define @insertelt_nxv1i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu @@ -50,7 +54,9 @@ define @insertelt_nxv2i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv2i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -62,7 +68,9 @@ define @insertelt_nxv2i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu @@ -85,7 +93,9 @@ define @insertelt_nxv4i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv4i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -97,7 +107,9 @@ define @insertelt_nxv4i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv4i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu @@ -120,7 +132,9 @@ define @insertelt_nxv8i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv8i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -132,7 +146,9 @@ define @insertelt_nxv8i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv8i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu @@ -155,7 +171,9 @@ define @insertelt_nxv16i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv16i8_imm: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -167,7 +185,9 @@ define @insertelt_nxv16i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv16i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu @@ -190,7 +210,9 @@ define @insertelt_nxv32i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv32i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -202,7 +224,9 @@ define @insertelt_nxv32i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv32i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu @@ -225,7 +249,9 @@ define @insertelt_nxv64i8_imm( %v, i8 signext %elt) { ; CHECK-LABEL: insertelt_nxv64i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -237,7 +263,9 @@ define @insertelt_nxv64i8_idx( %v, i8 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv64i8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu @@ -260,7 +288,9 @@ define @insertelt_nxv1i16_imm( %v, i16 signext %elt) { ; CHECK-LABEL: insertelt_nxv1i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -272,7 +302,9 @@ define @insertelt_nxv1i16_idx( %v, i16 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu @@ -295,7 +327,9 @@ define @insertelt_nxv2i16_imm( %v, i16 signext %elt) { ; CHECK-LABEL: insertelt_nxv2i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, 
zero, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -307,7 +341,9 @@ define @insertelt_nxv2i16_idx( %v, i16 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2i16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu @@ -331,6 +367,8 @@ ; CHECK-LABEL: insertelt_nxv4i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -343,6 +381,8 @@ ; CHECK-LABEL: insertelt_nxv4i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu @@ -366,6 +406,8 @@ ; CHECK-LABEL: insertelt_nxv8i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -378,6 +420,8 @@ ; CHECK-LABEL: insertelt_nxv8i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu @@ -401,6 +445,8 @@ ; CHECK-LABEL: insertelt_nxv16i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -413,6 +459,8 @@ ; CHECK-LABEL: insertelt_nxv16i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu @@ -436,6 +484,8 @@ ; CHECK-LABEL: insertelt_nxv32i16_imm: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e16, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -448,6 +498,8 @@ ; CHECK-LABEL: insertelt_nxv32i16_idx: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, mu @@ -470,7 +522,9 @@ define @insertelt_nxv1i32_imm( %v, i32 signext %elt) { ; CHECK-LABEL: insertelt_nxv1i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -482,7 +536,9 @@ define 
@insertelt_nxv1i32_idx( %v, i32 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv1i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu @@ -505,7 +561,9 @@ define @insertelt_nxv2i32_imm( %v, i32 signext %elt) { ; CHECK-LABEL: insertelt_nxv2i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -517,7 +575,9 @@ define @insertelt_nxv2i32_idx( %v, i32 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv2i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu @@ -540,7 +600,9 @@ define @insertelt_nxv4i32_imm( %v, i32 signext %elt) { ; CHECK-LABEL: insertelt_nxv4i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -552,7 +614,9 @@ define @insertelt_nxv4i32_idx( %v, i32 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv4i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu @@ -575,7 +639,9 @@ define @insertelt_nxv8i32_imm( %v, i32 signext %elt) { ; CHECK-LABEL: insertelt_nxv8i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -587,7 +653,9 @@ define @insertelt_nxv8i32_idx( %v, i32 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv8i32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu @@ -610,7 +678,9 @@ define @insertelt_nxv16i32_imm( %v, i32 signext %elt) { ; CHECK-LABEL: insertelt_nxv16i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e32, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -622,7 +692,9 @@ define @insertelt_nxv16i32_idx( %v, i32 signext %elt, i32 signext %idx) { ; CHECK-LABEL: insertelt_nxv16i32_idx: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, tu, mu @@ -645,7 +717,9 @@ define @insertelt_nxv1i64_imm( %v, i64 %elt) { ; CHECK-LABEL: insertelt_nxv1i64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, tu, mu ; CHECK-NEXT: vslideup.vi v8, v9, 3 @@ -657,7 +731,9 @@ define @insertelt_nxv1i64_idx( %v, i64 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv1i64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: sext.w a0, a1 ; CHECK-NEXT: addi a1, a0, 1 @@ -681,7 +757,9 @@ define @insertelt_nxv2i64_imm( %v, i64 %elt) { ; CHECK-LABEL: insertelt_nxv2i64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu ; CHECK-NEXT: vslideup.vi v8, v10, 3 @@ -693,7 +771,9 @@ define @insertelt_nxv2i64_idx( %v, i64 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv2i64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m2, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: sext.w a0, a1 ; CHECK-NEXT: addi a1, a0, 1 @@ -717,7 +797,9 @@ define @insertelt_nxv4i64_imm( %v, i64 %elt) { ; CHECK-LABEL: insertelt_nxv4i64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m4, tu, mu ; CHECK-NEXT: vslideup.vi v8, v12, 3 @@ -729,7 +811,9 @@ define @insertelt_nxv4i64_idx( %v, i64 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv4i64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e64, m4, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m4, tu, mu ; CHECK-NEXT: vmv.s.x v12, a0 ; CHECK-NEXT: sext.w a0, a1 ; CHECK-NEXT: addi a1, a0, 1 @@ -753,7 +837,9 @@ define @insertelt_nxv8i64_imm( %v, i64 %elt) { ; CHECK-LABEL: insertelt_nxv8i64_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: vsetivli zero, 4, e64, m8, tu, mu ; CHECK-NEXT: vslideup.vi v8, v16, 3 @@ -765,7 +851,9 @@ define @insertelt_nxv8i64_idx( %v, i64 %elt, i32 %idx) { ; CHECK-LABEL: insertelt_nxv8i64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli a2, zero, e64, m8, tu, mu ; CHECK-NEXT: vmv.s.x v16, a0 ; CHECK-NEXT: sext.w a0, a1 ; CHECK-NEXT: addi a1, a0, 1 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -247,6 +247,8 @@
 define @masked_load_zero_mask(* %a) nounwind {
 ; CHECK-LABEL: masked_load_zero_mask:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: ret
 %load = call @llvm.masked.load.nxv2i8(* %a, i32 1, zeroinitializer, undef)
 ret %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
--- a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
@@ -1352,8 +1352,11 @@
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT: vmfeq.vv v0, v9, v10
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmfeq.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: ret
 entry:
 %mask = call @llvm.riscv.vmfeq.nxv1f16(
@@ -1385,16 +1388,22 @@
 ; RV32-NEXT: sw a0, 8(sp)
 ; RV32-NEXT: addi a0, sp, 8
 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT: vmseq.vv v0, v8, v9, v0.t
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vmseq.vv v9, v8, v10, v0.t
+; RV32-NEXT: vmv.v.v v0, v9
 ; RV32-NEXT: addi sp, sp, 16
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: intrinsic_vmseq_mask_vx_nxv1i64_i64:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT: vmseq.vx v0, v8, a0, v0.t
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.i v9, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vmseq.vx v9, v8, a0, v0.t
+; RV64-NEXT: vmv.v.v v0, v9
 ; RV64-NEXT: ret
 entry:
 %a = call @llvm.riscv.vmseq.mask.nxv1i64.i64(
@@ -1422,18 +1431,22 @@
 ; RV32-NEXT: sw a0, 8(sp)
 ; RV32-NEXT: addi a0, sp, 8
 ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vmsle.vv v9, v10, v8, v0.t
+; RV32-NEXT: vmv.v.v v0, v9
 ; RV32-NEXT: addi sp, sp, 16
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: intrinsic_vmsge_mask_vx_nxv1i64_i64:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT: vmslt.vx v8, v8, a0, v0.t
-; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV64-NEXT: vmxor.mm v0, v8, v0
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.i v9, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vmslt.vx v9, v8, a0, v0.t
+; RV64-NEXT: vmxor.mm v0, v9, v0
 ; RV64-NEXT: ret
 entry:
 %a = call @llvm.riscv.vmsge.mask.nxv1i64.i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -1277,9 +1277,12 @@
 ; RV32-NEXT: vluxei32.v v16, (zero), v8, v0.t
 ; RV32-NEXT: csrr a0, vlenb
 ; RV32-NEXT: srli a2, a0, 3
+; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.i v8, 0
 ; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
-; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: vslidedown.vx v8, v0, a2
 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu
+; RV32-NEXT: vmv1r.v v0, v8
 ; RV32-NEXT: vluxei32.v v24, (zero), v12, v0.t
 ; RV32-NEXT: slli a0, a0, 3
 ; RV32-NEXT: add a0, a1, a0
@@ -1297,21 +1300,23 @@
 ; RV64-NEXT: vl8re64.v v24, (a0)
 ; RV64-NEXT: addi a0, sp, 16
 ; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: vmv8r.v v16, v8
-; RV64-NEXT: vl8re64.v v8, (a1)
+; RV64-NEXT: vl8re64.v v16, (a1)
 ; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu
-; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t
+; RV64-NEXT: vluxei64.v v24, (zero), v8, v0.t
 ; RV64-NEXT: csrr a0, vlenb
 ; RV64-NEXT: srli a1, a0, 3
+; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.i v8, 0
 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
-; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: vslidedown.vx v8, v0, a1
 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
+; RV64-NEXT: vmv1r.v v0, v8
 ; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vluxei64.v v16, (zero), v8, v0.t
 ; RV64-NEXT: slli a0, a0, 3
 ; RV64-NEXT: add a0, a2, a0
-; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vs8r.v v16, (a0)
 ; RV64-NEXT: vs8r.v v24, (a2)
 ; RV64-NEXT: csrr a0, vlenb
 ; RV64-NEXT: slli a0, a0, 3
@@ -2213,15 +2218,21 @@
 ; RV64-NEXT: vsext.vf8 v16, v8
 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; RV64-NEXT: vmv.v.i v12, 0
+; RV64-NEXT: vmv1r.v v8, v10
 ; RV64-NEXT: csrr a1, vlenb
 ; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.i v10, 0
 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu
-; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: vslidedown.vx v10, v0, a1
 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
 ; RV64-NEXT: vsext.vf8 v16, v9
 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT: vmv1r.v v0, v10
 ; RV64-NEXT: vluxei64.v v11, (a0), v16, v0.t
-; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: vmv.v.v v9, v11
 ; RV64-NEXT: ret
 %ptrs = getelementptr inbounds i8, i8* %base, %idxs
 %v = call @llvm.masked.gather.nxv16i8.nxv16p0i8( %ptrs, i32 2, %m, %passthru)
@@ -2237,15 +2248,21 @@
 ; RV32-NEXT: vsext.vf4 v16, v8
 ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
 ; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
+; RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vmv2r.v v8, v12
 ; RV32-NEXT: csrr a1, vlenb
 ; RV32-NEXT: srli a1, a1, 2
-; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, mu
-; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32-NEXT: vslidedown.vx v12, v0, a1
 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
 ; RV32-NEXT: vsext.vf4 v16, v10
 ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32-NEXT: vmv1r.v v0, v12
 ; RV32-NEXT: vluxei32.v v14, (a0), v16, v0.t
-; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: vmv.v.v v10, v14
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: mgather_baseidx_nxv32i8:
@@ -2255,28 +2272,40 @@
 ; RV64-NEXT: vsext.vf8 v24, v8
 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t
+; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu
+;
RV64-NEXT: vmv.v.i v20, 0 +; RV64-NEXT: vmv1r.v v8, v12 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a2, a1, 3 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vslidedown.vx v0, v16, a2 ; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v24, v9 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t +; RV64-NEXT: vmv.v.v v9, v13 ; RV64-NEXT: srli a1, a1, 2 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; RV64-NEXT: vslidedown.vx v0, v16, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t +; RV64-NEXT: vmv.v.v v10, v14 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vslidedown.vx v12, v0, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t -; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: vmv.v.v v11, v15 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs %v = call @llvm.masked.gather.nxv32i8.nxv32p0i8( %ptrs, i32 2, %m, %passthru) diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1831,9 +1831,12 @@ ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vx v0, v0, a0 +; RV32-NEXT: vslidedown.vx v8, v0, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsoxei32.v v16, (zero), v28, v0.t ; RV32-NEXT: ret ; @@ -1852,9 +1855,12 @@ ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a0, a0, 3 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v0, a0 +; RV64-NEXT: vslidedown.vx v8, v0, a0 ; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t @@ -1882,9 +1888,12 @@ ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vx v0, v0, a1 +; RV32-NEXT: vslidedown.vx v8, v0, a1 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret ; @@ -1897,12 +1906,15 @@ ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v0, a1 +; 
RV64-NEXT: vslidedown.vx v8, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v8, v3 -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsext.vf8 v24, v3 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, %idxs %v0 = call @llvm.vector.insert.nxv8f64.nxv16f64( undef, %val0, i64 0) @@ -1922,9 +1934,12 @@ ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vx v0, v0, a1 +; RV32-NEXT: vslidedown.vx v8, v0, a1 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret ; @@ -1937,12 +1952,15 @@ ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v0, a1 +; RV64-NEXT: vslidedown.vx v8, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v8, v6 -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsext.vf4 v24, v6 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, %idxs %v0 = call @llvm.vector.insert.nxv8f64.nxv16f64( undef, %val0, i64 0) diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -507,10 +507,14 @@ ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0 ; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v16, v16, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, mu -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v28, v16, v8 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v20, v8 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v16, v8 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v20, v8 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV32-BITS-UNKNOWN-NEXT: vmv4r.v v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0 +; RV32-BITS-UNKNOWN-NEXT: vmv4r.v v12, v24 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v24, 1 +; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; @@ -541,10 +545,13 @@ ; RV32-BITS-512-NEXT: vmv.v.i v16, 0 ; RV32-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, mu -; RV32-BITS-512-NEXT: vrgather.vv v28, v16, v8 -; RV32-BITS-512-NEXT: vrgather.vv v24, v20, v8 +; RV32-BITS-512-NEXT: vrgather.vv v12, v16, v8 +; RV32-BITS-512-NEXT: vrgather.vv v16, v20, v8 +; RV32-BITS-512-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV32-BITS-512-NEXT: vmv4r.v v8, v16 +; RV32-BITS-512-NEXT: vmv.v.i v16, 0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; RV32-BITS-512-NEXT: vand.vi v8, v24, 1 +; RV32-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret ; @@ -560,10 +567,14 @@ ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v16, 
0 ; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v16, v16, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, mu -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v28, v16, v8 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v20, v8 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v16, v8 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v20, v8 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV64-BITS-UNKNOWN-NEXT: vmv4r.v v8, v16 +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0 +; RV64-BITS-UNKNOWN-NEXT: vmv4r.v v12, v24 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v24, 1 +; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; @@ -594,10 +605,13 @@ ; RV64-BITS-512-NEXT: vmv.v.i v16, 0 ; RV64-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, mu -; RV64-BITS-512-NEXT: vrgather.vv v28, v16, v8 -; RV64-BITS-512-NEXT: vrgather.vv v24, v20, v8 +; RV64-BITS-512-NEXT: vrgather.vv v12, v16, v8 +; RV64-BITS-512-NEXT: vrgather.vv v16, v20, v8 +; RV64-BITS-512-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV64-BITS-512-NEXT: vmv4r.v v8, v16 +; RV64-BITS-512-NEXT: vmv.v.i v16, 0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; RV64-BITS-512-NEXT: vand.vi v8, v24, 1 +; RV64-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.experimental.vector.reverse.nxv64i1( %a) @@ -1078,11 +1092,13 @@ ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, mu ; RV32-BITS-UNKNOWN-NEXT: vid.v v16 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, mu -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v20, v8, v24 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v12, v24 -; RV32-BITS-UNKNOWN-NEXT: vmv8r.v v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v12, v16 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0 +; RV32-BITS-UNKNOWN-NEXT: vmv4r.v v12, v24 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv64i8: @@ -1104,10 +1120,12 @@ ; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, mu ; RV32-BITS-512-NEXT: vid.v v16 -; RV32-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-512-NEXT: vrgather.vv v20, v8, v24 -; RV32-BITS-512-NEXT: vrgather.vv v16, v12, v24 -; RV32-BITS-512-NEXT: vmv8r.v v8, v16 +; RV32-BITS-512-NEXT: vrsub.vx v16, v16, a0 +; RV32-BITS-512-NEXT: vrgather.vv v20, v8, v16 +; RV32-BITS-512-NEXT: vrgather.vv v8, v12, v16 +; RV32-BITS-512-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV32-BITS-512-NEXT: vmv.v.i v24, 0 +; RV32-BITS-512-NEXT: vmv4r.v v12, v20 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv64i8: @@ -1117,11 +1135,13 @@ ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, mu ; RV64-BITS-UNKNOWN-NEXT: vid.v v16 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, mu -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v20, v8, v24 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v12, v24 -; RV64-BITS-UNKNOWN-NEXT: vmv8r.v v8, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v8, v16 +; 
RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v12, v16 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0 +; RV64-BITS-UNKNOWN-NEXT: vmv4r.v v12, v24 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv64i8: @@ -1143,10 +1163,12 @@ ; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, mu ; RV64-BITS-512-NEXT: vid.v v16 -; RV64-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-512-NEXT: vrgather.vv v20, v8, v24 -; RV64-BITS-512-NEXT: vrgather.vv v16, v12, v24 -; RV64-BITS-512-NEXT: vmv8r.v v8, v16 +; RV64-BITS-512-NEXT: vrsub.vx v16, v16, a0 +; RV64-BITS-512-NEXT: vrgather.vv v20, v8, v16 +; RV64-BITS-512-NEXT: vrgather.vv v8, v12, v16 +; RV64-BITS-512-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; RV64-BITS-512-NEXT: vmv.v.i v24, 0 +; RV64-BITS-512-NEXT: vmv4r.v v12, v20 ; RV64-BITS-512-NEXT: ret %res = call @llvm.experimental.vector.reverse.nxv64i8( %a) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -9,8 +9,11 @@ define @fcmp_oeq_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"oeq", %m, i32 %evl) ret %v @@ -19,8 +22,11 @@ define @fcmp_oeq_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -31,8 +37,11 @@ define @fcmp_oeq_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -43,8 +52,11 @@ define @fcmp_ogt_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ogt", %m, i32 %evl) ret %v @@ -53,8 +65,11 @@ define @fcmp_ogt_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: 
vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -65,8 +80,11 @@ define @fcmp_ogt_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -77,8 +95,11 @@ define @fcmp_oge_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"oge", %m, i32 %evl) ret %v @@ -87,8 +108,11 @@ define @fcmp_oge_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -99,8 +123,11 @@ define @fcmp_oge_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -111,8 +138,11 @@ define @fcmp_olt_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"olt", %m, i32 %evl) ret %v @@ -121,8 +151,11 @@ define @fcmp_olt_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = 
insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -133,8 +166,11 @@ define @fcmp_olt_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -145,8 +181,11 @@ define @fcmp_ole_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ole", %m, i32 %evl) ret %v @@ -155,8 +194,11 @@ define @fcmp_ole_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -167,8 +209,11 @@ define @fcmp_ole_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -179,11 +224,15 @@ define @fcmp_one_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"one", %m, i32 %evl) ret %v @@ -192,11 +241,15 @@ define @fcmp_one_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, 
mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -207,11 +260,15 @@ define @fcmp_one_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -222,11 +279,15 @@ define @fcmp_ord_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ord", %m, i32 %evl) ret %v @@ -237,11 +298,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -254,11 +319,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; 
CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -269,11 +338,15 @@ define @fcmp_ueq_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmnor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ueq", %m, i32 %evl) ret %v @@ -282,11 +355,15 @@ define @fcmp_ueq_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -297,11 +374,15 @@ define @fcmp_ueq_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -312,10 +393,11 @@ define @fcmp_ugt_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -324,10 +406,11 @@ define @fcmp_ugt_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, 
zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -338,10 +421,11 @@ define @fcmp_ugt_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -352,10 +436,11 @@ define @fcmp_uge_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -364,10 +449,11 @@ define @fcmp_uge_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -378,10 +464,11 @@ define @fcmp_uge_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -392,10 +479,11 @@ define @fcmp_ult_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -404,10 +492,11 @@ define @fcmp_ult_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; 
CHECK-LABEL: fcmp_ult_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -418,10 +507,11 @@ define @fcmp_ult_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -432,10 +522,11 @@ define @fcmp_ule_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"ule", %m, i32 %evl) ret %v @@ -444,10 +535,11 @@ define @fcmp_ule_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -458,10 +550,11 @@ define @fcmp_ule_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -472,8 +565,11 @@ define @fcmp_une_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call 
@llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"une", %m, i32 %evl) ret %v @@ -482,8 +578,11 @@ define @fcmp_une_vf_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -494,8 +593,11 @@ define @fcmp_une_vf_swap_nxv1f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -506,11 +608,15 @@ define @fcmp_uno_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f16( %va, %vb, metadata !"uno", %m, i32 %evl) ret %v @@ -521,11 +627,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -538,11 +648,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: 
vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -555,8 +669,11 @@ define @fcmp_oeq_vv_nxv3f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv3f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv3f16( %va, %vb, metadata !"oeq", %m, i32 %evl) ret %v @@ -567,7 +684,9 @@ define @fcmp_oeq_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -578,7 +697,9 @@ define @fcmp_oeq_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -591,7 +712,9 @@ define @fcmp_oeq_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -604,7 +727,9 @@ define @fcmp_ogt_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -615,7 +740,9 @@ define @fcmp_ogt_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -628,7 +755,9 @@ define @fcmp_ogt_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -641,7 +770,9 @@ define @fcmp_oge_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vv v12, v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -652,7 +783,9 @@ 
define @fcmp_oge_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -665,7 +798,9 @@ define @fcmp_oge_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -678,7 +813,9 @@ define @fcmp_olt_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -689,7 +826,9 @@ define @fcmp_olt_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -702,7 +841,9 @@ define @fcmp_olt_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -715,7 +856,9 @@ define @fcmp_ole_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -726,7 +869,9 @@ define @fcmp_ole_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -739,7 +884,9 @@ define @fcmp_ole_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -752,10 +899,14 @@ define @fcmp_one_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: 
vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v13, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v13, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"one", %m, i32 %evl) @@ -765,10 +916,14 @@ define @fcmp_one_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v11, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -780,10 +935,14 @@ define @fcmp_one_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v11, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -795,10 +954,14 @@ define @fcmp_ord_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vv v12, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v10, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"ord", %m, i32 %evl) @@ -810,10 +973,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v10, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -827,10 +994,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, 
mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v12, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -842,10 +1013,14 @@ define @fcmp_ueq_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v13, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v13, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"ueq", %m, i32 %evl) @@ -855,10 +1030,14 @@ define @fcmp_ueq_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v11, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -870,10 +1049,14 @@ define @fcmp_ueq_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v11, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -885,9 +1068,10 @@ define @fcmp_ugt_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vv v12, v8, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"ugt", %m, i32 %evl) @@ -897,9 +1081,10 @@ define @fcmp_ugt_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -911,9 +1096,10 @@ define @fcmp_ugt_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv8f16: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -925,9 +1111,10 @@ define @fcmp_uge_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v8, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"uge", %m, i32 %evl) @@ -937,9 +1124,10 @@ define @fcmp_uge_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -951,9 +1139,10 @@ define @fcmp_uge_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -965,9 +1154,10 @@ define @fcmp_ult_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vv v12, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"ult", %m, i32 %evl) @@ -977,9 +1167,10 @@ define @fcmp_ult_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -991,9 +1182,10 @@ define @fcmp_ult_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1005,9 +1197,10 @@ define @fcmp_ule_vv_nxv8f16( %va, %vb, %m, 
i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vv v12, v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"ule", %m, i32 %evl) @@ -1017,9 +1210,10 @@ define @fcmp_ule_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1031,9 +1225,10 @@ define @fcmp_ule_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1045,7 +1240,9 @@ define @fcmp_une_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1056,7 +1253,9 @@ define @fcmp_une_vf_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1069,7 +1268,9 @@ define @fcmp_une_vf_swap_nxv8f16( %va, half %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vf v10, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -1082,10 +1283,14 @@ define @fcmp_uno_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v12, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v10, v12 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f16( %va, %vb, metadata !"uno", %m, i32 %evl) @@ -1097,10 +1302,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f 
v10, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v10, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1114,10 +1323,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v12, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -1134,43 +1347,59 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a1, a3, 1 +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a5, zero, e8, m1, ta, mu ; CHECK-NEXT: slli a5, a3, 3 ; CHECK-NEXT: add a5, a0, a5 -; CHECK-NEXT: vl8re16.v v24, (a5) +; CHECK-NEXT: vl8re16.v v8, (a5) ; CHECK-NEXT: slli a3, a3, 2 ; CHECK-NEXT: sub a5, a2, a3 -; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: vslidedown.vx v0, v2, a1 ; CHECK-NEXT: bltu a2, a5, .LBB85_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB85_2: -; CHECK-NEXT: vl8re16.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a4, e16, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v2, v16, v24, v0.t +; CHECK-NEXT: vl8re16.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a4, e16, m8, ta, mu +; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB85_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB85_4: -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v24, v8, v0.t +; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t ; CHECK-NEXT: add a0, a1, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vx v16, v2, a1 +; CHECK-NEXT: vslideup.vx v16, v1, a1 ; 
CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1183,8 +1412,11 @@ define @fcmp_oeq_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"oeq", %m, i32 %evl) ret %v @@ -1193,8 +1425,11 @@ define @fcmp_oeq_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1205,8 +1440,11 @@ define @fcmp_oeq_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1217,8 +1455,11 @@ define @fcmp_ogt_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ogt", %m, i32 %evl) ret %v @@ -1227,8 +1468,11 @@ define @fcmp_ogt_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1239,8 +1483,11 @@ define @fcmp_ogt_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1251,8 +1498,11 @@ define 
@fcmp_oge_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"oge", %m, i32 %evl) ret %v @@ -1261,8 +1511,11 @@ define @fcmp_oge_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1273,8 +1526,11 @@ define @fcmp_oge_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1285,8 +1541,11 @@ define @fcmp_olt_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"olt", %m, i32 %evl) ret %v @@ -1295,8 +1554,11 @@ define @fcmp_olt_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1307,8 +1569,11 @@ define @fcmp_olt_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1319,8 +1584,11 @@ define @fcmp_ole_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, 
e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ole", %m, i32 %evl) ret %v @@ -1329,8 +1597,11 @@ define @fcmp_ole_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1341,8 +1612,11 @@ define @fcmp_ole_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfge.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1353,11 +1627,15 @@ define @fcmp_one_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"one", %m, i32 %evl) ret %v @@ -1366,11 +1644,15 @@ define @fcmp_one_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1381,11 +1663,15 @@ define @fcmp_one_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, 
ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1396,11 +1682,15 @@ define @fcmp_ord_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ord", %m, i32 %evl) ret %v @@ -1411,11 +1701,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1428,11 +1722,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfeq.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmand.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1443,11 +1741,15 @@ define @fcmp_ueq_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v11, v9, v8, v0.t +; CHECK-NEXT: vmnor.mm v0, v11, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, 
metadata !"ueq", %m, i32 %evl) ret %v @@ -1456,11 +1758,15 @@ define @fcmp_ueq_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1471,11 +1777,15 @@ define @fcmp_ueq_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t +; CHECK-NEXT: vmnor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1486,10 +1796,11 @@ define @fcmp_ugt_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -1498,10 +1809,11 @@ define @fcmp_ugt_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1512,10 +1824,11 @@ define @fcmp_ugt_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = 
shufflevector %elt.head, poison, zeroinitializer @@ -1526,10 +1839,11 @@ define @fcmp_uge_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -1538,10 +1852,11 @@ define @fcmp_uge_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1552,10 +1867,11 @@ define @fcmp_uge_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1566,10 +1882,11 @@ define @fcmp_ult_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -1578,10 +1895,11 @@ define @fcmp_ult_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfge.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfge.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1592,10 +1910,11 @@ define @fcmp_ult_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfle.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: 
vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfle.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1606,10 +1925,11 @@ define @fcmp_ule_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"ule", %m, i32 %evl) ret %v @@ -1618,10 +1938,11 @@ define @fcmp_ule_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1632,10 +1953,11 @@ define @fcmp_ule_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmflt.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmnot.m v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1646,8 +1968,11 @@ define @fcmp_une_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"une", %m, i32 %evl) ret %v @@ -1656,8 +1981,11 @@ define @fcmp_une_vf_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1668,8 +1996,11 @@ define @fcmp_une_vf_swap_nxv1f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v0, v8, fa0, v0.t +; 
CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vf v9, v8, fa0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1680,11 +2011,15 @@ define @fcmp_uno_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vv v9, v9, v9, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vv v10, v9, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv1f64( %va, %vb, metadata !"uno", %m, i32 %evl) ret %v @@ -1695,11 +2030,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v10 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1712,11 +2051,15 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t -; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vf v10, v9, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmfne.vv v9, v8, v8, v0.t +; CHECK-NEXT: vmor.mm v0, v10, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1729,7 +2072,9 @@ define @fcmp_oeq_vv_nxv3f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1742,7 +2087,9 @@ define @fcmp_oeq_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; 
CHECK-NEXT: vmfeq.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -1753,7 +2100,9 @@ define @fcmp_oeq_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1766,7 +2115,9 @@ define @fcmp_oeq_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oeq_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1779,7 +2130,9 @@ define @fcmp_ogt_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -1790,7 +2143,9 @@ define @fcmp_ogt_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1803,7 +2158,9 @@ define @fcmp_ogt_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ogt_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1816,7 +2173,9 @@ define @fcmp_oge_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -1827,7 +2186,9 @@ define @fcmp_oge_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfge.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1840,7 +2201,9 @@ define @fcmp_oge_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_oge_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1853,7 +2216,9 @@ define @fcmp_olt_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vv_nxv8f64: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -1864,7 +2229,9 @@ define @fcmp_olt_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1877,7 +2244,9 @@ define @fcmp_olt_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_olt_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1890,7 +2259,9 @@ define @fcmp_ole_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -1901,7 +2272,9 @@ define @fcmp_ole_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1914,7 +2287,9 @@ define @fcmp_ole_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ole_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfge.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1927,10 +2302,14 @@ define @fcmp_one_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v8, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v25, v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v25, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"one", %m, i32 %evl) @@ -1940,10 +2319,14 @@ define @fcmp_one_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: 
vmfgt.vf v17, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v17, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1955,10 +2338,14 @@ define @fcmp_one_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_one_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v17, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v17, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1970,10 +2357,14 @@ define @fcmp_ord_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ord_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vv v24, v16, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v16, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"ord", %m, i32 %evl) @@ -1985,10 +2376,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v16, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2002,10 +2397,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmand.mm v0, v24, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2017,10 +2416,14 @@ define @fcmp_ueq_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v8, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v25, v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v25, v24 ; CHECK-NEXT: 
ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"ueq", %m, i32 %evl) @@ -2030,10 +2433,14 @@ define @fcmp_ueq_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v17, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v17, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2045,10 +2452,14 @@ define @fcmp_ueq_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ueq_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v17, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnor.mm v0, v17, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2060,9 +2471,10 @@ define @fcmp_ugt_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vv v24, v8, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"ugt", %m, i32 %evl) @@ -2072,9 +2484,10 @@ define @fcmp_ugt_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2086,9 +2499,10 @@ define @fcmp_ugt_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ugt_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfge.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2100,9 +2514,10 @@ define @fcmp_uge_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v8, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata 
!"uge", %m, i32 %evl) @@ -2112,9 +2527,10 @@ define @fcmp_uge_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2126,9 +2542,10 @@ define @fcmp_uge_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uge_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2140,9 +2557,10 @@ define @fcmp_ult_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vv v24, v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"ult", %m, i32 %evl) @@ -2152,9 +2570,10 @@ define @fcmp_ult_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfge.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2166,9 +2585,10 @@ define @fcmp_ult_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ult_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfle.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2180,9 +2600,10 @@ define @fcmp_ule_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vv v24, v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"ule", %m, i32 %evl) @@ -2192,9 +2613,10 @@ define @fcmp_ule_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfgt.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, 
zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2206,9 +2628,10 @@ define @fcmp_ule_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_ule_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v8, fa0, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmnot.m v0, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2220,7 +2643,9 @@ define @fcmp_une_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -2231,7 +2656,9 @@ define @fcmp_une_vf_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2244,7 +2671,9 @@ define @fcmp_une_vf_swap_nxv8f64( %va, double %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_une_vf_swap_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vf v16, v8, fa0, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2257,10 +2686,14 @@ define @fcmp_uno_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: fcmp_uno_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v24, v16, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v16, v24 ; CHECK-NEXT: ret %v = call @llvm.vp.fcmp.nxv8f64( %va, %vb, metadata !"uno", %m, i32 %evl) @@ -2272,10 +2705,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v16, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2289,10 +2726,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu 
; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vmor.mm v0, v24, v16 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -2311,7 +2752,7 @@ ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 @@ -2325,6 +2766,8 @@ ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a1, a3, 3 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: slli a5, a3, 3 ; CHECK-NEXT: slli a7, a3, 1 ; CHECK-NEXT: add a4, a2, a5 @@ -2334,32 +2777,35 @@ ; CHECK-NEXT: mv t0, a7 ; CHECK-NEXT: .LBB171_2: ; CHECK-NEXT: li t1, 0 +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli t2, zero, e8, mf4, ta, mu -; CHECK-NEXT: vl8re64.v v16, (a4) +; CHECK-NEXT: vl8re64.v v8, (a4) ; CHECK-NEXT: srli a4, a3, 2 ; CHECK-NEXT: sub t2, t0, a3 -; CHECK-NEXT: vslidedown.vx v0, v24, a1 +; CHECK-NEXT: vslidedown.vx v0, v1, a1 ; CHECK-NEXT: bltu t0, t2, .LBB171_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv t1, t2 ; CHECK-NEXT: .LBB171_4: ; CHECK-NEXT: li t2, 24 ; CHECK-NEXT: vsetvli t3, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v1, v24, a4 -; CHECK-NEXT: vl8re64.v v8, (a2) -; CHECK-NEXT: csrr t3, vlenb -; CHECK-NEXT: slli t3, t3, 3 -; CHECK-NEXT: add t3, sp, t3 -; CHECK-NEXT: addi t3, t3, 16 -; CHECK-NEXT: vs8r.v v8, (t3) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v16, v1, a4 ; CHECK-NEXT: slli t3, a3, 4 -; CHECK-NEXT: vsetvli zero, t1, e64, m8, ta, ma +; CHECK-NEXT: vl8re64.v v24, (a2) +; CHECK-NEXT: csrr t4, vlenb +; CHECK-NEXT: slli t4, t4, 3 +; CHECK-NEXT: add t4, sp, t4 +; CHECK-NEXT: addi t4, t4, 16 +; CHECK-NEXT: vs8r.v v24, (t4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v18, 0 +; CHECK-NEXT: vsetvli zero, t1, e64, m8, ta, mu ; CHECK-NEXT: csrr t1, vlenb ; CHECK-NEXT: slli t1, t1, 4 ; CHECK-NEXT: add t1, sp, t1 ; CHECK-NEXT: addi t1, t1, 16 -; CHECK-NEXT: vl8re8.v v8, (t1) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v2, v8, v16, v0.t +; CHECK-NEXT: vl8re8.v v24, (t1) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v18, v24, v8, v0.t ; CHECK-NEXT: bltu t0, a3, .LBB171_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv t0, a3 @@ -2367,43 +2813,47 @@ ; CHECK-NEXT: li t1, 0 ; CHECK-NEXT: mul t4, a3, t2 ; CHECK-NEXT: add t2, a2, t3 -; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli t3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v2, 0 +; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr t0, vlenb ; CHECK-NEXT: li t3, 24 ; CHECK-NEXT: mul t0, t0, t3 ; CHECK-NEXT: add t0, sp, t0 ; CHECK-NEXT: addi t0, t0, 16 -; CHECK-NEXT: vl8re8.v v24, (t0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8re8.v v8, (t0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr t0, vlenb ; CHECK-NEXT: slli t0, t0, 3 ; CHECK-NEXT: add t0, sp, t0 ; CHECK-NEXT: addi t0, t0, 16 -; CHECK-NEXT: vl8re8.v v8, (t0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v17, v24, v8, v0.t +; CHECK-NEXT: vl8re8.v v24, (t0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v2, v8, v24, v0.t 
; CHECK-NEXT: sub t0, a6, a7 ; CHECK-NEXT: add a7, a1, a1 ; CHECK-NEXT: bltu a6, t0, .LBB171_8 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: mv t1, t0 ; CHECK-NEXT: .LBB171_8: +; CHECK-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: add a2, a2, t4 -; CHECK-NEXT: vl8re64.v v8, (t2) +; CHECK-NEXT: vl8re64.v v24, (t2) ; CHECK-NEXT: csrr a6, vlenb ; CHECK-NEXT: li t0, 24 ; CHECK-NEXT: mul a6, a6, t0 ; CHECK-NEXT: add a6, sp, a6 ; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: csrr a6, vlenb ; CHECK-NEXT: slli a6, a6, 4 ; CHECK-NEXT: add a6, sp, a6 ; CHECK-NEXT: addi a6, a6, 16 -; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a5 ; CHECK-NEXT: vsetvli zero, a7, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v17, v2, a1 +; CHECK-NEXT: vslideup.vx v2, v18, a1 ; CHECK-NEXT: mv a5, t1 ; CHECK-NEXT: bltu t1, a3, .LBB171_10 ; CHECK-NEXT: # %bb.9: @@ -2411,18 +2861,20 @@ ; CHECK-NEXT: .LBB171_10: ; CHECK-NEXT: li a6, 0 ; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v1, a1 -; CHECK-NEXT: vl8re64.v v8, (a2) +; CHECK-NEXT: vslidedown.vx v8, v16, a1 +; CHECK-NEXT: vl8re64.v v24, (a2) ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a0, a0, a2 @@ -2433,32 +2885,34 @@ ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v18, v8, v24, v0.t +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v9, v16, v24, v0.t ; CHECK-NEXT: add a0, a4, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu ; CHECK-NEXT: sub a0, t1, a3 -; CHECK-NEXT: vslideup.vx v17, v18, a4 +; CHECK-NEXT: vslideup.vx v2, v9, a4 ; CHECK-NEXT: bltu t1, a0, .LBB171_12 ; CHECK-NEXT: # %bb.11: ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: .LBB171_12: -; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t +; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v9, v24, v16, v0.t ; CHECK-NEXT: slli 
a0, a1, 1 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v17, v16, a0 -; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vslideup.vx v2, v9, a0 +; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll @@ -12,8 +12,11 @@ define @icmp_eq_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -22,8 +25,11 @@ define @icmp_eq_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -34,8 +40,11 @@ define @icmp_eq_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -46,8 +55,11 @@ define @icmp_eq_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -58,8 +70,11 @@ define @icmp_eq_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -70,8 +85,11 @@ define @icmp_ne_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: 
vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"ne", %m, i32 %evl) ret %v @@ -80,8 +98,11 @@ define @icmp_ne_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -92,8 +113,11 @@ define @icmp_ne_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -104,8 +128,11 @@ define @icmp_ne_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -116,8 +143,11 @@ define @icmp_ne_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -128,8 +158,11 @@ define @icmp_ugt_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -138,8 +171,11 @@ define @icmp_ugt_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -150,8 +186,11 @@ define @icmp_ugt_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; 
CHECK-LABEL: icmp_ugt_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -162,8 +201,11 @@ define @icmp_ugt_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -174,8 +216,11 @@ define @icmp_ugt_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -186,8 +231,11 @@ define @icmp_uge_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -197,9 +245,12 @@ ; CHECK-LABEL: icmp_uge_vx_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -210,8 +261,11 @@ define @icmp_uge_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -222,8 +276,11 @@ define @icmp_uge_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli 
zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -234,8 +291,11 @@ define @icmp_uge_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -246,8 +306,11 @@ define @icmp_ult_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -256,8 +319,11 @@ define @icmp_ult_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -268,8 +334,11 @@ define @icmp_ult_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -280,8 +349,11 @@ define @icmp_ult_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -292,8 +364,11 @@ define @icmp_ult_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -304,8 +379,11 @@ define @icmp_sgt_vv_nxv1i8( %va, %vb, %m, i32 
zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmslt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmslt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"sgt", %m, i32 %evl) ret %v @@ -314,8 +392,11 @@ define @icmp_sgt_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -326,8 +407,11 @@ define @icmp_sgt_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -338,8 +422,11 @@ define @icmp_sgt_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -350,8 +437,11 @@ define @icmp_sgt_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -362,8 +452,11 @@ define @icmp_sge_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"sge", %m, i32 %evl) ret %v @@ -373,9 +466,12 @@ ; CHECK-LABEL: icmp_sge_vx_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, 
mf8, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -386,8 +482,11 @@ define @icmp_sge_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -398,8 +497,11 @@ define @icmp_sge_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -410,8 +512,11 @@ define @icmp_sge_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -422,8 +527,11 @@ define @icmp_slt_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmslt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmslt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"slt", %m, i32 %evl) ret %v @@ -432,8 +540,11 @@ define @icmp_slt_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -444,8 +555,11 @@ define @icmp_slt_vx_swap_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -456,8 +570,11 @@ define @icmp_slt_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; 
CHECK-LABEL: icmp_slt_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -468,8 +585,11 @@ define @icmp_slt_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -480,8 +600,11 @@ define @icmp_sle_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i8( %va, %vb, metadata !"sle", %m, i32 %evl) ret %v @@ -490,8 +613,11 @@ define @icmp_sle_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -503,9 +629,12 @@ ; CHECK-LABEL: icmp_sle_vx_swap_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -516,8 +645,11 @@ define @icmp_sle_vi_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -528,8 +660,11 @@ define @icmp_sle_vi_swap_nxv1i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, 
mf8, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -542,8 +677,11 @@ define @icmp_eq_vv_nxv3i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv3i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv3i8( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -552,8 +690,11 @@ define @icmp_eq_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_nxv3i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -564,8 +705,11 @@ define @icmp_eq_vx_swap_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_nxv3i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -581,9 +725,12 @@ ; CHECK-NEXT: li a1, 127 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i7( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -594,11 +741,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a2 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vand.vx v10, v8, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -611,11 +761,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 127 ; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, mu -; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vv v0, v9, v8, v0.t +; CHECK-NEXT: vand.vx v9, v8, a2 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vand.vx v10, v8, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, 
m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vv v8, v10, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, i7 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -628,8 +781,11 @@ define @icmp_eq_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -638,8 +794,11 @@ define @icmp_eq_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -650,8 +809,11 @@ define @icmp_eq_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -662,8 +824,11 @@ define @icmp_eq_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -674,8 +839,11 @@ define @icmp_eq_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -686,8 +854,11 @@ define @icmp_ne_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"ne", %m, i32 %evl) ret %v @@ -696,8 +867,11 @@ define @icmp_ne_vx_nxv8i8( %va, i8 %b, %m, 
i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -708,8 +882,11 @@ define @icmp_ne_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -720,8 +897,11 @@ define @icmp_ne_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -732,8 +912,11 @@ define @icmp_ne_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -744,8 +927,11 @@ define @icmp_ugt_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -754,8 +940,11 @@ define @icmp_ugt_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -766,8 +955,11 @@ define @icmp_ugt_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: 
vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -778,8 +970,11 @@ define @icmp_ugt_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -790,8 +985,11 @@ define @icmp_ugt_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -802,8 +1000,11 @@ define @icmp_uge_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -813,9 +1014,12 @@ ; CHECK-LABEL: icmp_uge_vx_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -826,8 +1030,11 @@ define @icmp_uge_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -838,8 +1045,11 @@ define @icmp_uge_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -850,8 +1060,11 @@ define @icmp_uge_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { 
; CHECK-LABEL: icmp_uge_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -862,8 +1075,11 @@ define @icmp_ult_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -872,8 +1088,11 @@ define @icmp_ult_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -884,8 +1103,11 @@ define @icmp_ult_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -896,8 +1118,11 @@ define @icmp_ult_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -908,8 +1133,11 @@ define @icmp_ult_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -920,8 +1148,11 @@ define @icmp_sgt_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmslt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmslt.vv 
v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"sgt", %m, i32 %evl) ret %v @@ -930,8 +1161,11 @@ define @icmp_sgt_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -942,8 +1176,11 @@ define @icmp_sgt_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -954,8 +1191,11 @@ define @icmp_sgt_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -966,8 +1206,11 @@ define @icmp_sgt_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -978,8 +1221,11 @@ define @icmp_sge_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"sge", %m, i32 %evl) ret %v @@ -989,9 +1235,12 @@ ; CHECK-LABEL: icmp_sge_vx_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1002,8 +1251,11 @@ define @icmp_sge_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: 
icmp_sge_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1014,8 +1266,11 @@ define @icmp_sge_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1026,8 +1281,11 @@ define @icmp_sge_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1038,8 +1296,11 @@ define @icmp_slt_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmslt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmslt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"slt", %m, i32 %evl) ret %v @@ -1048,8 +1309,11 @@ define @icmp_slt_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1060,8 +1324,11 @@ define @icmp_slt_vx_swap_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1072,8 +1339,11 @@ define @icmp_slt_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; 
CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1084,8 +1354,11 @@ define @icmp_slt_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1096,8 +1369,11 @@ define @icmp_sle_vv_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv8i8( %va, %vb, metadata !"sle", %m, i32 %evl) ret %v @@ -1106,8 +1382,11 @@ define @icmp_sle_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1119,9 +1398,12 @@ ; CHECK-LABEL: icmp_sle_vx_swap_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1132,8 +1414,11 @@ define @icmp_sle_vi_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1144,8 +1429,11 @@ define @icmp_sle_vi_swap_nxv8i8( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1174,9 +1462,10 @@ ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; 
CHECK-NEXT: vsetvli a4, zero, e8, m8, ta, mu -; CHECK-NEXT: vlm.v v25, (a2) +; CHECK-NEXT: vlm.v v2, (a2) ; CHECK-NEXT: sub a4, a3, a1 -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 @@ -1190,30 +1479,34 @@ ; CHECK-NEXT: vl8r.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v16, v24, v8, v0.t ; CHECK-NEXT: bltu a3, a1, .LBB96_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB96_4: -; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmseq.vv v17, v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmv1r.v v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 @@ -1236,18 +1529,22 @@ ; CHECK-NEXT: .LBB97_2: ; CHECK-NEXT: li a5, 0 ; CHECK-NEXT: vsetvli a6, zero, e8, m8, ta, mu -; CHECK-NEXT: vlm.v v24, (a1) -; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v25, (a1) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, mu ; CHECK-NEXT: sub a1, a2, a3 -; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t +; CHECK-NEXT: vmseq.vx v24, v8, a0, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB97_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a5, a1 ; CHECK-NEXT: .LBB97_4: -; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i8 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1267,18 +1564,22 @@ ; CHECK-NEXT: .LBB98_2: ; CHECK-NEXT: li a5, 0 ; CHECK-NEXT: vsetvli a6, zero, e8, m8, ta, mu -; CHECK-NEXT: vlm.v v24, (a1) -; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v25, (a1) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, mu ; CHECK-NEXT: sub a1, a2, a3 -; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t +; CHECK-NEXT: vmseq.vx v24, v8, a0, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB98_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a5, a1 ; 
CHECK-NEXT: .LBB98_4: -; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i8 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1291,8 +1592,11 @@ define @icmp_eq_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -1301,8 +1605,11 @@ define @icmp_eq_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1313,8 +1620,11 @@ define @icmp_eq_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmseq.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1325,8 +1635,11 @@ define @icmp_eq_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1337,8 +1650,11 @@ define @icmp_eq_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1349,8 +1665,11 @@ define @icmp_ne_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: 
vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"ne", %m, i32 %evl) ret %v @@ -1359,8 +1678,11 @@ define @icmp_ne_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1371,8 +1693,11 @@ define @icmp_ne_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1383,8 +1708,11 @@ define @icmp_ne_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1395,8 +1723,11 @@ define @icmp_ne_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1407,8 +1738,11 @@ define @icmp_ugt_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -1417,8 +1751,11 @@ define @icmp_ugt_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1429,8 +1766,11 @@ define 
@icmp_ugt_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1441,8 +1781,11 @@ define @icmp_ugt_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1453,8 +1796,11 @@ define @icmp_ugt_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1465,8 +1811,11 @@ define @icmp_uge_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -1476,9 +1825,12 @@ ; CHECK-LABEL: icmp_uge_vx_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1489,8 +1841,11 @@ define @icmp_uge_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1501,8 +1856,11 @@ define @icmp_uge_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 
3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1513,8 +1871,11 @@ define @icmp_uge_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1525,8 +1886,11 @@ define @icmp_ult_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -1535,8 +1899,11 @@ define @icmp_ult_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsltu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1547,8 +1914,11 @@ define @icmp_ult_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1559,8 +1929,11 @@ define @icmp_ult_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1571,8 +1944,11 @@ define @icmp_ult_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = 
insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1583,8 +1959,11 @@ define @icmp_sgt_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmslt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmslt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"sgt", %m, i32 %evl) ret %v @@ -1593,8 +1972,11 @@ define @icmp_sgt_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1605,8 +1987,11 @@ define @icmp_sgt_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1617,8 +2002,11 @@ define @icmp_sgt_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1629,8 +2017,11 @@ define @icmp_sgt_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1641,8 +2032,11 @@ define @icmp_sge_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"sge", %m, i32 %evl) ret %v @@ -1652,9 +2046,12 @@ ; CHECK-LABEL: icmp_sge_vx_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, 
a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1665,8 +2062,11 @@ define @icmp_sge_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1677,8 +2077,11 @@ define @icmp_sge_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1689,8 +2092,11 @@ define @icmp_sge_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1701,8 +2107,11 @@ define @icmp_slt_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmslt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmslt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"slt", %m, i32 %evl) ret %v @@ -1711,8 +2120,11 @@ define @icmp_slt_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmslt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1723,8 +2135,11 @@ define @icmp_slt_vx_swap_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: 
vmsgt.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1735,8 +2150,11 @@ define @icmp_slt_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1747,8 +2165,11 @@ define @icmp_slt_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1759,8 +2180,11 @@ define @icmp_sle_vv_nxv1i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i32( %va, %vb, metadata !"sle", %m, i32 %evl) ret %v @@ -1769,8 +2193,11 @@ define @icmp_sle_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vx v9, v8, a0, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1782,9 +2209,12 @@ ; CHECK-LABEL: icmp_sle_vx_swap_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1795,8 +2225,11 @@ define @icmp_sle_vi_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1807,8 +2240,11 @@ define 
@icmp_sle_vi_swap_nxv1i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1821,7 +2257,9 @@ define @icmp_eq_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmseq.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1832,7 +2270,9 @@ define @icmp_eq_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmseq.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1845,7 +2285,9 @@ define @icmp_eq_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmseq.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1858,7 +2300,9 @@ define @icmp_eq_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmseq.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1871,7 +2315,9 @@ define @icmp_eq_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmseq.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1884,7 +2330,9 @@ define @icmp_ne_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsne.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1895,7 +2343,9 @@ define @icmp_ne_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsne.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1908,7 +2358,9 @@ define @icmp_ne_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, 
ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsne.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1921,7 +2373,9 @@ define @icmp_ne_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsne.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1934,7 +2388,9 @@ define @icmp_ne_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsne.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1947,7 +2403,9 @@ define @icmp_ugt_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsltu.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -1958,7 +2416,9 @@ define @icmp_ugt_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsgtu.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1971,7 +2431,9 @@ define @icmp_ugt_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsltu.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1984,7 +2446,9 @@ define @icmp_ugt_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -1997,7 +2461,9 @@ define @icmp_ugt_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2010,7 +2476,9 @@ define @icmp_uge_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2023,7 +2491,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, 
e32, m4, ta, mu ; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vv v12, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2036,7 +2506,9 @@ define @icmp_uge_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2049,7 +2521,9 @@ define @icmp_uge_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2062,7 +2536,9 @@ define @icmp_uge_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2075,7 +2551,9 @@ define @icmp_ult_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsltu.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2086,7 +2564,9 @@ define @icmp_ult_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsltu.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2099,7 +2579,9 @@ define @icmp_ult_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsgtu.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2112,7 +2594,9 @@ define @icmp_ult_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsleu.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2125,7 +2609,9 @@ define @icmp_ult_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgtu.vi v12, v8, 4, v0.t ; CHECK-NEXT: 
vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2138,7 +2624,9 @@ define @icmp_sgt_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmslt.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2149,7 +2637,9 @@ define @icmp_sgt_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2162,7 +2652,9 @@ define @icmp_sgt_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmslt.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2175,7 +2667,9 @@ define @icmp_sgt_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2188,7 +2682,9 @@ define @icmp_sgt_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2201,7 +2697,9 @@ define @icmp_sge_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2214,7 +2712,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e32, m4, ta, mu ; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vv v12, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2227,7 +2727,9 @@ define @icmp_sge_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2240,7 +2742,9 @@ define @icmp_sge_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; 
CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2253,7 +2757,9 @@ define @icmp_sge_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2266,7 +2772,9 @@ define @icmp_slt_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmslt.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2277,7 +2785,9 @@ define @icmp_slt_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmslt.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2290,7 +2800,9 @@ define @icmp_slt_vx_swap_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vx_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2303,7 +2815,9 @@ define @icmp_slt_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2316,7 +2830,9 @@ define @icmp_slt_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2329,7 +2845,9 @@ define @icmp_sle_vv_nxv8i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2340,7 +2858,9 @@ define @icmp_sle_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vx v12, v8, a0, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2355,7 +2875,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e32, m4, ta, mu ; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, 
m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vv v12, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2368,7 +2890,9 @@ define @icmp_sle_vi_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsle.vi v12, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2381,7 +2905,9 @@ define @icmp_sle_vi_swap_nxv8i32( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmsgt.vi v12, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: ret @@ -2399,43 +2925,59 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v1, v0 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a1, a3, 2 -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a5, a3, 3 ; CHECK-NEXT: add a5, a0, a5 -; CHECK-NEXT: vl8re32.v v24, (a5) +; CHECK-NEXT: vl8re32.v v8, (a5) ; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a5, a2, a3 -; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: vslidedown.vx v0, v2, a1 ; CHECK-NEXT: bltu a2, a5, .LBB189_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB189_2: -; CHECK-NEXT: vl8re32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; CHECK-NEXT: vmseq.vv v2, v16, v24, v0.t +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v1, 0 +; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, mu +; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB189_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB189_4: -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vv v16, v24, v8, v0.t +; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: add a0, a1, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v16, v2, a1 +; CHECK-NEXT: vslideup.vx v16, v1, a1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 
; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -2450,21 +2992,27 @@ ; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a5, a1, a3 -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: vslidedown.vx v0, v24, a2 ; CHECK-NEXT: bltu a1, a5, .LBB190_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB190_2: -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, mu ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a1, a3, .LBB190_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: .LBB190_4: -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 @@ -2485,21 +3033,27 @@ ; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: srli a2, a3, 2 -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a3, a3, 1 ; CHECK-NEXT: sub a5, a1, a3 -; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: vslidedown.vx v0, v24, a2 ; CHECK-NEXT: bltu a1, a5, .LBB191_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a5 ; CHECK-NEXT: .LBB191_2: -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, mu ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a1, a3, .LBB191_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: .LBB191_4: -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 @@ -2518,8 +3072,11 @@ define @icmp_eq_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmseq.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmseq.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -2534,16 +3091,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmseq.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmseq.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_eq_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmseq.vx v0, v8, a0, v0.t +; 
RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmseq.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2560,16 +3123,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmseq.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmseq.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_eq_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmseq.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmseq.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2580,8 +3149,11 @@ define @icmp_eq_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2592,8 +3164,11 @@ define @icmp_eq_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmseq.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmseq.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2604,8 +3179,11 @@ define @icmp_ne_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsne.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsne.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"ne", %m, i32 %evl) ret %v @@ -2620,16 +3198,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsne.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsne.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ne_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; 
RV64-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsne.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2646,16 +3230,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsne.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsne.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ne_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsne.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2666,8 +3256,11 @@ define @icmp_ne_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2678,8 +3271,11 @@ define @icmp_ne_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsne.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2690,8 +3286,11 @@ define @icmp_ugt_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"ugt", %m, i32 %evl) ret %v @@ -2706,16 +3305,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsltu.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsltu.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ugt_vx_nxv1i64: ; RV64: # %bb.0: -; 
RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2732,16 +3337,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsltu.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsltu.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ugt_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsltu.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2752,8 +3363,11 @@ define @icmp_ugt_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2764,8 +3378,11 @@ define @icmp_ugt_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2776,8 +3393,11 @@ define @icmp_uge_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"uge", %m, i32 %evl) ret %v @@ -2792,18 +3412,24 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsleu.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsleu.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret 
; ; RV64-LABEL: icmp_uge_vx_nxv1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsleu.vv v0, v9, v8, v0.t +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsleu.vv v9, v10, v8, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2820,16 +3446,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsleu.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsleu.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_uge_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsleu.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsleu.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2840,8 +3472,11 @@ define @icmp_uge_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2852,8 +3487,11 @@ define @icmp_uge_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2864,8 +3502,11 @@ define @icmp_ult_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsltu.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"ult", %m, i32 %evl) ret %v @@ -2880,16 +3521,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsltu.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsltu.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ult_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsltu.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2906,16 +3553,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsltu.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsltu.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_ult_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsgtu.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2926,8 +3579,11 @@ define @icmp_ult_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsleu.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsleu.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2938,8 +3594,11 @@ define @icmp_ult_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgtu.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgtu.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2950,8 +3609,11 @@ define @icmp_sgt_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmslt.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmslt.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"sgt", %m, i32 %evl) ret %v @@ -2966,16 +3628,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmslt.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli 
a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmslt.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sgt_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsgt.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2992,16 +3660,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmslt.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmslt.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sgt_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmslt.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3012,8 +3686,11 @@ define @icmp_sgt_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3024,8 +3701,11 @@ define @icmp_sgt_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3036,8 +3716,11 @@ define @icmp_sge_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"sge", %m, i32 %evl) ret %v @@ -3052,18 +3735,24 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: 
vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsle.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sge_vx_nxv1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsle.vv v9, v10, v8, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3080,16 +3769,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsle.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsle.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sge_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsle.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3100,8 +3795,11 @@ define @icmp_sge_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3112,8 +3810,11 @@ define @icmp_sge_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3124,8 +3825,11 @@ define @icmp_slt_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmslt.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmslt.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"slt", %m, i32 %evl) ret %v @@ -3140,16 +3844,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, 
m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmslt.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmslt.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_slt_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmslt.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3166,16 +3876,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmslt.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmslt.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_slt_vx_swap_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsgt.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3186,8 +3902,11 @@ define @icmp_slt_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3198,8 +3917,11 @@ define @icmp_slt_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3210,8 +3932,11 @@ define @icmp_sle_vv_nxv1i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vv v0, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv1i64( %va, %vb, metadata !"sle", %m, i32 %evl) ret %v @@ -3226,16 +3951,22 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, 
sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsle.vv v0, v8, v9, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsle.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sle_vx_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsle.vx v9, v8, a0, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3252,18 +3983,24 @@ ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmsle.vv v9, v10, v8, v0.t +; RV32-NEXT: vmv.v.v v0, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: icmp_sle_vx_swap_nxv1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmsle.vv v9, v10, v8, v0.t +; RV64-NEXT: vmv.v.v v0, v9 ; RV64-NEXT: ret %elt.head = insertelement poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3274,8 +4011,11 @@ define @icmp_sle_vi_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, 4, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsle.vi v9, v8, 4, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3286,8 +4026,11 @@ define @icmp_sle_vi_swap_nxv1i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmsgt.vi v0, v8, 3, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu +; CHECK-NEXT: vmsgt.vi v9, v8, 3, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, i64 4, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -3300,7 +4043,9 @@ define @icmp_eq_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmseq.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3318,7 +4063,9 @@ ; RV32-NEXT: addi a0, sp, 
8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmseq.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3326,7 +4073,9 @@ ; ; RV64-LABEL: icmp_eq_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3346,7 +4095,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmseq.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3354,7 +4105,9 @@ ; ; RV64-LABEL: icmp_eq_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3367,7 +4120,9 @@ define @icmp_eq_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmseq.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3380,7 +4135,9 @@ define @icmp_eq_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmseq.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3393,7 +4150,9 @@ define @icmp_ne_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsne.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3411,7 +4170,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsne.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3419,7 +4180,9 @@ ; ; RV64-LABEL: icmp_ne_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsne.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3439,7 +4202,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, 
e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsne.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3447,7 +4212,9 @@ ; ; RV64-LABEL: icmp_ne_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsne.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3460,7 +4227,9 @@ define @icmp_ne_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsne.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3473,7 +4242,9 @@ define @icmp_ne_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ne_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsne.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3486,7 +4257,9 @@ define @icmp_ugt_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsltu.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3504,7 +4277,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsltu.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3512,7 +4287,9 @@ ; ; RV64-LABEL: icmp_ugt_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsgtu.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3532,7 +4309,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsltu.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3540,7 +4319,9 @@ ; ; RV64-LABEL: icmp_ugt_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsltu.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3553,7 +4334,9 @@ define @icmp_ugt_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; 
CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgtu.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3566,7 +4349,9 @@ define @icmp_ugt_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ugt_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsleu.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3579,7 +4364,9 @@ define @icmp_uge_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsleu.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3597,7 +4384,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsleu.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3607,7 +4396,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v24, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsleu.vv v16, v24, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3627,7 +4418,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsleu.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3635,7 +4428,9 @@ ; ; RV64-LABEL: icmp_uge_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsleu.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3648,7 +4443,9 @@ define @icmp_uge_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgtu.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3661,7 +4458,9 @@ define @icmp_uge_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_uge_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsleu.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3674,7 +4473,9 @@ define @icmp_ult_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i 
v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsltu.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3692,7 +4493,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsltu.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3700,7 +4503,9 @@ ; ; RV64-LABEL: icmp_ult_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsltu.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3720,7 +4525,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsltu.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3728,7 +4535,9 @@ ; ; RV64-LABEL: icmp_ult_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsgtu.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3741,7 +4550,9 @@ define @icmp_ult_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsleu.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3754,7 +4565,9 @@ define @icmp_ult_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_ult_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgtu.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3767,7 +4580,9 @@ define @icmp_sgt_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmslt.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3785,7 +4600,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmslt.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3793,7 +4610,9 @@ ; ; RV64-LABEL: icmp_sgt_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsgt.vx v16, v8, a0, v0.t ; 
RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3813,7 +4632,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmslt.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3821,7 +4642,9 @@ ; ; RV64-LABEL: icmp_sgt_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmslt.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3834,7 +4657,9 @@ define @icmp_sgt_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgt.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3847,7 +4672,9 @@ define @icmp_sgt_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sgt_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3860,7 +4687,9 @@ define @icmp_sge_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vv v24, v16, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3878,7 +4707,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsle.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3888,7 +4719,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v24, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsle.vv v16, v24, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3908,7 +4741,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsle.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3916,7 +4751,9 @@ ; ; RV64-LABEL: icmp_sge_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsle.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -3929,7 +4766,9 
@@ define @icmp_sge_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgt.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3942,7 +4781,9 @@ define @icmp_sge_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sge_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -3955,7 +4796,9 @@ define @icmp_slt_vv_nxv8i64( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmslt.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -3973,7 +4816,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmslt.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -3981,7 +4826,9 @@ ; ; RV64-LABEL: icmp_slt_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmslt.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -4001,7 +4848,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmslt.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -4009,7 +4858,9 @@ ; ; RV64-LABEL: icmp_slt_vx_swap_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsgt.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -4022,7 +4873,9 @@ define @icmp_slt_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -4035,7 +4888,9 @@ define @icmp_slt_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_slt_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgt.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -4048,7 +4903,9 @@ define @icmp_sle_vv_nxv8i64( %va, %vb, 
%m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vv v24, v8, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: ret @@ -4066,7 +4923,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsle.vv v16, v8, v24, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -4074,7 +4933,9 @@ ; ; RV64-LABEL: icmp_sle_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsle.vx v16, v8, a0, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -4094,7 +4955,9 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vmsle.vv v16, v24, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: addi sp, sp, 16 @@ -4104,7 +4967,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu ; RV64-NEXT: vmv.v.x v24, a0 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; RV64-NEXT: vmsle.vv v16, v24, v8, v0.t ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: ret @@ -4117,7 +4982,9 @@ define @icmp_sle_vi_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsle.vi v16, v8, 4, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -4130,7 +4997,9 @@ define @icmp_sle_vi_swap_nxv8i64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_sle_vi_swap_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu ; CHECK-NEXT: vmsgt.vi v16, v8, 3, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll --- a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll @@ -11,6 +11,8 @@ define <4 x i32> @vload_v4i32_zero_evl(<4 x i32>* %ptr, <4 x i1> %m) { ; CHECK-LABEL: vload_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> %m, i32 0) ret <4 x i32> %v @@ -19,6 +21,8 @@ define <4 x i32> @vload_v4i32_false_mask(<4 x i32>* %ptr, i32 %evl) { ; CHECK-LABEL: vload_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> zeroinitializer, 
i32 %evl) ret <4 x i32> %v @@ -29,6 +33,8 @@ define <4 x i32> @vgather_v4i32_v4i32_zero_evl(<4 x i32*> %ptrs, <4 x i1> %m) { ; CHECK-LABEL: vgather_v4i32_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> %m, i32 0) ret <4 x i32> %v @@ -37,6 +43,8 @@ define <4 x i32> @vgather_v4i32_v4i32_false_mask(<4 x i32*> %ptrs, i32 %evl) { ; CHECK-LABEL: vgather_v4i32_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %v @@ -83,6 +91,8 @@ define <4 x i32> @vadd_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vadd_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -91,6 +101,8 @@ define <4 x i32> @vadd_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vadd_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -101,6 +113,8 @@ define <4 x i32> @vand_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vand_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -109,6 +123,8 @@ define <4 x i32> @vand_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vand_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -119,6 +135,8 @@ define <4 x i32> @vlshr_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vlshr_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -127,6 +145,8 @@ define <4 x i32> @vlshr_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vlshr_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -137,6 +157,8 @@ define <4 x i32> @vmul_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vmul_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -145,6 +167,8 @@ define <4 x i32> @vmul_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vmul_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -155,6 +179,8 @@ define <4 x i32> @vor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vor_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -163,6 +189,8 @@ define <4 x i32> @vor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vor_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -173,6 +201,8 @@ define <4 x i32> @vsdiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vsdiv_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -181,6 +211,8 @@ define <4 x i32> @vsdiv_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vsdiv_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -191,6 +223,8 @@ define <4 x i32> @vsrem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vsrem_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -199,6 +233,8 @@ define <4 x i32> @vsrem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vsrem_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -209,6 +245,8 @@ define <4 x i32> @vsub_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vsub_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -217,6 +255,8 @@ define <4 x i32> @vsub_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vsub_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -227,6 +267,8 @@ define <4 x i32> @vudiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vudiv_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -235,6 +277,8 @@ define <4 x i32> @vudiv_v4i32_false_mask(<4 x i32> %va, <4 x 
i32> %vb, i32 %evl) { ; CHECK-LABEL: vudiv_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -245,6 +289,8 @@ define <4 x i32> @vurem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vurem_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -253,6 +299,8 @@ define <4 x i32> @vurem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vurem_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -263,6 +311,8 @@ define <4 x i32> @vxor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { ; CHECK-LABEL: vxor_v4i32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) ret <4 x i32> %s @@ -271,6 +321,8 @@ define <4 x i32> @vxor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { ; CHECK-LABEL: vxor_v4i32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x i32> %s @@ -281,6 +333,8 @@ define <4 x float> @vfadd_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { ; CHECK-LABEL: vfadd_v4f32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) ret <4 x float> %s @@ -289,6 +343,8 @@ define <4 x float> @vfadd_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { ; CHECK-LABEL: vfadd_v4f32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x float> %s @@ -299,6 +355,8 @@ define <4 x float> @vfsub_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { ; CHECK-LABEL: vfsub_v4f32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) ret <4 x float> %s @@ -307,6 +365,8 @@ define <4 x float> @vfsub_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { ; CHECK-LABEL: vfsub_v4f32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x float> %s @@ -317,6 +377,8 @@ define <4 x float> @vfmul_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { ; CHECK-LABEL: vfmul_v4f32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 
; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) ret <4 x float> %s @@ -325,6 +387,8 @@ define <4 x float> @vfmul_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { ; CHECK-LABEL: vfmul_v4f32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x float> %s @@ -335,6 +399,8 @@ define <4 x float> @vfdiv_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { ; CHECK-LABEL: vfdiv_v4f32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) ret <4 x float> %s @@ -343,6 +409,8 @@ define <4 x float> @vfdiv_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { ; CHECK-LABEL: vfdiv_v4f32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x float> %s @@ -353,6 +421,8 @@ define <4 x float> @vfrem_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { ; CHECK-LABEL: vfrem_v4f32_zero_evl: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) ret <4 x float> %s @@ -361,6 +431,8 @@ define <4 x float> @vfrem_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { ; CHECK-LABEL: vfrem_v4f32_false_mask: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) ret <4 x float> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -521,8 +521,11 @@ define @intrinsic_vredsum_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredsum_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredsum.nxv8i8.nxv1i8( @@ -543,8 +546,11 @@ define @intrinsic_vredand_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredand_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredand.nxv8i8.nxv1i8( @@ -565,8 +571,11 @@ define @intrinsic_vredor_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredor_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, 
a0, e8, mf8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredor.nxv8i8.nxv1i8( @@ -587,8 +596,11 @@ define @intrinsic_vredxor_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredxor_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredxor.nxv8i8.nxv1i8( @@ -609,8 +621,11 @@ define @intrinsic_vredminu_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredminu_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredminu.nxv8i8.nxv1i8( @@ -631,8 +646,11 @@ define @intrinsic_vredmin_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredmin_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredmin.nxv8i8.nxv1i8( @@ -653,8 +671,11 @@ define @intrinsic_vredmaxu_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredmaxu_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredmaxu.nxv8i8.nxv1i8( @@ -675,8 +696,11 @@ define @intrinsic_vredmax_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vredmax_vs_nxv8i8_nxv1i8_nxv8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vredmax.nxv8i8.nxv1i8( @@ -697,8 +721,11 @@ define @intrinsic_vwredsumu_vs_nxv4i16_nxv1i8_nxv4i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vwredsumu_vs_nxv4i16_nxv1i8_nxv4i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call 
@llvm.riscv.vwredsumu.nxv4i16.nxv1i8( @@ -719,8 +746,11 @@ define @intrinsic_vwredsum_vs_nxv4i16_nxv1i8_nxv4i16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vwredsum_vs_nxv4i16_nxv1i8_nxv4i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vwredsum.nxv4i16.nxv1i8( @@ -741,8 +771,11 @@ define @intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfredosum.nxv4f16.nxv1f16( @@ -763,8 +796,11 @@ define @intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfredusum.nxv4f16.nxv1f16( @@ -785,8 +821,11 @@ define @intrinsic_vfredmax_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfredmax_vs_nxv4f16_nxv1f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfredmax.nxv4f16.nxv1f16( @@ -807,8 +846,11 @@ define @intrinsic_vfredmin_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfredmin_vs_nxv4f16_nxv1f16_nxv4f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfredmin.nxv4f16.nxv1f16( @@ -829,8 +871,11 @@ define @intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfwredosum.nxv2f32.nxv1f16( @@ -850,8 +895,11 @@ define @intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32: ; 
CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfwredusum.nxv2f32.nxv1f16( @@ -873,8 +921,11 @@ define @intrinsic_vslidedown_vx_nxv1i8_nxv1i8( %0, iXLen %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vslidedown_vx_nxv1i8_nxv1i8: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vslidedown.nxv1i8( @@ -927,7 +978,9 @@ ; ; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: ret entry: @@ -940,7 +993,9 @@ define @intrinsic_vfmv.s.f_f_nxv1f16(half %0, iXLen %1) nounwind { ; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: ret entry: @@ -957,7 +1012,9 @@ define @intrinsic_vcompress_um_nxv1i8_nxv1i8( %0, %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_vcompress_um_nxv1i8_nxv1i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu ; CHECK-NEXT: vcompress.vm v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1548,10 +1548,12 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB118_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 @@ -1615,10 +1617,12 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a4, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB120_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 @@ -1656,13 +1660,17 @@ ; RV32-NEXT: slli a0, a0, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu ; RV32-NEXT: vadd.vi v8, v8, -1, v0.t +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vi_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli 
a1, a0, 2 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; RV64-NEXT: vslidedown.vx v24, v0, a1 ; RV64-NEXT: slli a0, a0, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -15,15 +15,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v10, v10, a0 +; CHECK-NEXT: vslidedown.vx v11, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu -; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v11, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i1( %a, %b, i32 -1) @@ -39,15 +41,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vi v10, v10, 1 +; CHECK-NEXT: vslidedown.vi v11, v10, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu -; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vslideup.vx v11, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i1( %a, %b, i32 1) @@ -65,15 +69,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v10, v10, a0 +; CHECK-NEXT: vslidedown.vx v11, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v11, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 -1) @@ -89,15 +95,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v10, v10, 3 +; CHECK-NEXT: vslidedown.vi v11, v10, 3 ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, mu -; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vslideup.vx v11, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; 
CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 3) @@ -115,15 +123,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v10, v10, a0 +; CHECK-NEXT: vslidedown.vx v11, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v11, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 -1) @@ -139,15 +149,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v10, v10, 7 +; CHECK-NEXT: vslidedown.vi v11, v10, 7 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vslideup.vx v11, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 7) @@ -164,15 +176,17 @@ ; CHECK-NEXT: vmerge.vim v10, v9, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v10, v10, a0 +; CHECK-NEXT: vslidedown.vx v11, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v11, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 -1) @@ -187,15 +201,17 @@ ; CHECK-NEXT: vmerge.vim v10, v9, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v10, v10, 15 +; CHECK-NEXT: vslidedown.vi v11, v10, 15 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vx v10, v8, a0 +; CHECK-NEXT: vslideup.vx v11, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vand.vi v8, v10, 1 +; CHECK-NEXT: vand.vi v8, v11, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 15) @@ -213,15 +229,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: 
vmv.v.i v14, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v12, v12, a0 +; CHECK-NEXT: vslidedown.vx v14, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, mu -; CHECK-NEXT: vslideup.vi v12, v8, 1 +; CHECK-NEXT: vslideup.vi v14, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; CHECK-NEXT: vand.vi v8, v12, 1 +; CHECK-NEXT: vand.vi v8, v14, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 -1) @@ -237,15 +255,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v14, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v12, v12, 31 +; CHECK-NEXT: vslidedown.vi v14, v12, 31 ; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, mu -; CHECK-NEXT: vslideup.vx v12, v8, a0 +; CHECK-NEXT: vslideup.vx v14, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; CHECK-NEXT: vand.vi v8, v12, 1 +; CHECK-NEXT: vand.vi v8, v14, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 31) @@ -263,15 +283,17 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v20, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v16, a0 +; CHECK-NEXT: vslidedown.vx v20, v16, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, mu -; CHECK-NEXT: vslideup.vi v16, v8, 1 +; CHECK-NEXT: vslideup.vi v20, v8, 1 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu -; CHECK-NEXT: vand.vi v8, v16, 1 +; CHECK-NEXT: vand.vi v8, v20, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i1( %a, %b, i32 -1) @@ -288,15 +310,17 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -63 ; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v20, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v16, v16, a1 +; CHECK-NEXT: vslidedown.vx v20, v16, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, mu -; CHECK-NEXT: vslideup.vx v16, v8, a0 +; CHECK-NEXT: vslideup.vx v20, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu -; CHECK-NEXT: vand.vi v8, v16, 1 +; CHECK-NEXT: vand.vi v8, v20, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i1( %a, %b, i32 63) @@ -308,16 +332,19 @@ define @splice_nxv64i1_offset_negone( %a, %b) #0 { ; CHECK-LABEL: splice_nxv64i1_offset_negone: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v1, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, 
ta, mu -; CHECK-NEXT: vslidedown.vx v24, v24, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m8, tu, mu ; CHECK-NEXT: vslideup.vi v24, v8, 1 @@ -332,17 +359,20 @@ define @splice_nxv64i1_offset_max( %a, %b) #0 { ; CHECK-LABEL: splice_nxv64i1_offset_max: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v1, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -127 ; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v24, v24, a1 +; CHECK-NEXT: vslidedown.vx v24, v8, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e8, m8, tu, mu ; CHECK-NEXT: vslideup.vx v24, v8, a0 @@ -370,10 +400,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i8( %a, %b, i32 -1) ret %res @@ -385,10 +418,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i8( %a, %b, i32 -2) ret %res @@ -400,10 +436,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i8( %a, %b, i32 1) ret %res @@ -425,10 +464,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i8( %a, %b, i32 -1) ret %res @@ -440,10 +482,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; 
CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i8( %a, %b, i32 -4) ret %res @@ -455,10 +500,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i8( %a, %b, i32 3) ret %res @@ -480,10 +528,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i8( %a, %b, i32 -1) ret %res @@ -495,10 +546,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 8 +; CHECK-NEXT: vslideup.vi v10, v9, 8 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i8( %a, %b, i32 -8) ret %res @@ -510,10 +564,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i8( %a, %b, i32 7) ret %res @@ -534,10 +591,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i8( %a, %b, i32 -1) ret %res @@ -548,10 +608,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: 
vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 16 +; CHECK-NEXT: vslideup.vi v10, v9, 16 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i8( %a, %b, i32 -16) ret %res @@ -562,10 +625,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v10, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i8( %a, %b, i32 15) ret %res @@ -587,10 +653,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 -1) ret %res @@ -603,10 +672,13 @@ ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: vslideup.vx v12, v10, a1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 -32) ret %res @@ -618,10 +690,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vslidedown.vi v12, v8, 31 ; CHECK-NEXT: vsetvli a1, zero, e8, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i8( %a, %b, i32 31) ret %res @@ -643,10 +718,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i8( %a, %b, i32 -1) ret %res @@ -659,10 +737,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -64 ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a1 
+; CHECK-NEXT: vslideup.vx v16, v12, a1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i8( %a, %b, i32 -64) ret %res @@ -675,10 +756,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -63 ; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v16, v8, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i8( %a, %b, i32 63) ret %res @@ -700,10 +784,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv64i8( %a, %b, i32 -1) ret %res @@ -716,10 +803,13 @@ ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -128 ; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v24, v16, a1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv64i8( %a, %b, i32 -128) ret %res @@ -732,10 +822,13 @@ ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -127 ; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v24, v8, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv64i8( %a, %b, i32 127) ret %res @@ -757,10 +850,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i16( %a, %b, i32 -1) ret %res @@ -772,10 +868,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call 
@llvm.experimental.vector.splice.nxv1i16( %a, %b, i32 -2) ret %res @@ -787,10 +886,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i16( %a, %b, i32 1) ret %res @@ -812,10 +914,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i16( %a, %b, i32 -1) ret %res @@ -827,10 +932,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i16( %a, %b, i32 -4) ret %res @@ -842,10 +950,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i16( %a, %b, i32 3) ret %res @@ -867,10 +978,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i16( %a, %b, i32 -1) ret %res @@ -882,10 +996,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 8 +; CHECK-NEXT: vslideup.vi v10, v9, 8 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i16( %a, %b, i32 -8) ret %res @@ -897,10 +1014,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: 
srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i16( %a, %b, i32 7) ret %res @@ -921,10 +1041,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 -1) ret %res @@ -935,10 +1058,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 16 +; CHECK-NEXT: vslideup.vi v12, v10, 16 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 -16) ret %res @@ -949,10 +1075,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v12, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i16( %a, %b, i32 15) ret %res @@ -974,10 +1103,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i16( %a, %b, i32 -1) ret %res @@ -990,10 +1122,13 @@ ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: vslideup.vx v16, v12, a1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i16( %a, %b, i32 -32) ret %res @@ -1005,10 +1140,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli 
zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vslidedown.vi v16, v8, 31 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i16( %a, %b, i32 31) ret %res @@ -1030,10 +1168,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i16( %a, %b, i32 -1) ret %res @@ -1046,10 +1187,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -64 ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v24, v16, a1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i16( %a, %b, i32 -64) ret %res @@ -1062,10 +1206,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -63 ; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v24, v8, a1 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32i16( %a, %b, i32 63) ret %res @@ -1087,10 +1234,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i32( %a, %b, i32 -1) ret %res @@ -1102,10 +1252,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i32( %a, %b, i32 -2) ret %res @@ -1117,10 +1270,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; 
CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i32( %a, %b, i32 1) ret %res @@ -1142,10 +1298,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i32( %a, %b, i32 -1) ret %res @@ -1157,10 +1316,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i32( %a, %b, i32 -4) ret %res @@ -1172,10 +1334,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i32( %a, %b, i32 3) ret %res @@ -1197,10 +1362,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 -1) ret %res @@ -1212,10 +1380,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 -8) ret %res @@ -1227,10 +1398,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v12, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 
+; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i32( %a, %b, i32 7) ret %res @@ -1251,10 +1425,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i32( %a, %b, i32 -1) ret %res @@ -1265,10 +1442,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 16 +; CHECK-NEXT: vslideup.vi v16, v12, 16 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i32( %a, %b, i32 -16) ret %res @@ -1279,10 +1459,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v16, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i32( %a, %b, i32 15) ret %res @@ -1304,10 +1487,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i32( %a, %b, i32 -1) ret %res @@ -1320,10 +1506,13 @@ ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v24, v16, a1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i32( %a, %b, i32 -32) ret %res @@ -1335,10 +1524,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vslidedown.vi v24, v8, 31 ; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i32( %a, %b, i32 31) ret %res @@ 
-1360,10 +1552,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i64( %a, %b, i32 -1) ret %res @@ -1375,10 +1570,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i64( %a, %b, i32 -2) ret %res @@ -1390,10 +1588,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1i64( %a, %b, i32 1) ret %res @@ -1415,10 +1616,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 -1) ret %res @@ -1430,10 +1634,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 -4) ret %res @@ -1445,10 +1652,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v12, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i64( %a, %b, i32 3) ret %res @@ -1470,10 +1680,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: 
vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i64( %a, %b, i32 -1) ret %res @@ -1485,10 +1698,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vslideup.vi v16, v12, 8 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i64( %a, %b, i32 -8) ret %res @@ -1500,10 +1716,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v16, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i64( %a, %b, i32 7) ret %res @@ -1524,10 +1743,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i64( %a, %b, i32 -1) ret %res @@ -1538,10 +1760,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vslideup.vi v24, v16, 16 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i64( %a, %b, i32 -16) ret %res @@ -1552,10 +1777,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v24, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i64( %a, %b, i32 15) ret %res @@ -1577,10 +1805,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx 
v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f16( %a, %b, i32 -1) ret %res @@ -1592,10 +1823,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f16( %a, %b, i32 -2) ret %res @@ -1607,10 +1841,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f16( %a, %b, i32 1) ret %res @@ -1632,10 +1869,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f16( %a, %b, i32 -1) ret %res @@ -1647,10 +1887,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f16( %a, %b, i32 -4) ret %res @@ -1662,10 +1905,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f16( %a, %b, i32 3) ret %res @@ -1687,10 +1933,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, mu -; CHECK-NEXT: 
vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f16( %a, %b, i32 -1) ret %res @@ -1702,10 +1951,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 8 +; CHECK-NEXT: vslideup.vi v10, v9, 8 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f16( %a, %b, i32 -8) ret %res @@ -1717,10 +1969,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v10, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f16( %a, %b, i32 7) ret %res @@ -1741,10 +1996,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 -1) ret %res @@ -1755,10 +2013,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 16 +; CHECK-NEXT: vslideup.vi v12, v10, 16 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 -16) ret %res @@ -1769,10 +2030,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v12, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e16, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f16( %a, %b, i32 15) ret %res @@ -1794,10 +2058,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call 
@llvm.experimental.vector.splice.nxv16f16( %a, %b, i32 -1) ret %res @@ -1810,10 +2077,13 @@ ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: vslideup.vx v16, v12, a1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16f16( %a, %b, i32 -32) ret %res @@ -1825,10 +2095,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vslidedown.vi v16, v8, 31 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16f16( %a, %b, i32 31) ret %res @@ -1850,10 +2123,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32f16( %a, %b, i32 -1) ret %res @@ -1866,10 +2142,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -64 ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v24, v16, a1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32f16( %a, %b, i32 -64) ret %res @@ -1882,10 +2161,13 @@ ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -63 ; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vx v24, v8, a1 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv32f16( %a, %b, i32 63) ret %res @@ -1907,10 +2189,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f32( %a, %b, i32 -1) ret %res @@ -1922,10 +2207,13 @@ ; CHECK-NEXT: csrr 
a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f32( %a, %b, i32 -2) ret %res @@ -1937,10 +2225,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e32, mf2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f32( %a, %b, i32 1) ret %res @@ -1962,10 +2253,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f32( %a, %b, i32 -1) ret %res @@ -1977,10 +2271,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f32( %a, %b, i32 -4) ret %res @@ -1992,10 +2289,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f32( %a, %b, i32 3) ret %res @@ -2017,10 +2317,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 -1) ret %res @@ -2032,10 +2335,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; 
CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 -8) ret %res @@ -2047,10 +2353,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v12, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e32, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f32( %a, %b, i32 7) ret %res @@ -2071,10 +2380,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f32( %a, %b, i32 -1) ret %res @@ -2085,10 +2397,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 16 +; CHECK-NEXT: vslideup.vi v16, v12, 16 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f32( %a, %b, i32 -16) ret %res @@ -2099,10 +2414,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v16, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e32, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f32( %a, %b, i32 15) ret %res @@ -2124,10 +2442,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16f32( %a, %b, i32 -1) ret %res @@ -2140,10 +2461,13 @@ ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -32 ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx 
v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v24, v16, a1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16f32( %a, %b, i32 -32) ret %res @@ -2155,10 +2479,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vslidedown.vi v24, v8, 31 ; CHECK-NEXT: vsetvli a1, zero, e32, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16f32( %a, %b, i32 31) ret %res @@ -2180,10 +2507,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f64( %a, %b, i32 -1) ret %res @@ -2195,10 +2525,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v10, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f64( %a, %b, i32 -2) ret %res @@ -2210,10 +2543,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, tu, mu -; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v9, a0 +; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv1f64( %a, %b, i32 1) ret %res @@ -2235,10 +2571,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v12, v10, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 -1) ret %res @@ -2250,10 +2589,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v12, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: 
vslideup.vi v12, v10, 4 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 -4) ret %res @@ -2265,10 +2607,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vslidedown.vi v12, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e64, m2, tu, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2f64( %a, %b, i32 3) ret %res @@ -2290,10 +2635,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v16, v12, 1 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f64( %a, %b, i32 -1) ret %res @@ -2305,10 +2653,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vslideup.vi v16, v12, 8 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f64( %a, %b, i32 -8) ret %res @@ -2320,10 +2671,13 @@ ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vslidedown.vi v16, v8, 7 ; CHECK-NEXT: vsetvli a1, zero, e64, m4, tu, mu -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v16, v12, a0 +; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4f64( %a, %b, i32 7) ret %res @@ -2344,10 +2698,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: vslideup.vi v24, v16, 1 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f64( %a, %b, i32 -1) ret %res @@ -2358,10 +2715,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vslideup.vi v24, v16, 16 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f64( %a, %b, 
i32 -16) ret %res @@ -2372,10 +2732,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vslidedown.vi v24, v8, 15 ; CHECK-NEXT: vsetvli a1, zero, e64, m8, tu, mu -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v24, v16, a0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8f64( %a, %b, i32 15) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1217,6 +1217,8 @@ ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; CHECK-NEXT: slli a5, a1, 3 ; CHECK-NEXT: add a6, a2, a5 @@ -1236,7 +1238,7 @@ ; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: srli a6, a1, 3 ; CHECK-NEXT: sub a5, a4, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a6 +; CHECK-NEXT: vslidedown.vx v0, v1, a6 ; CHECK-NEXT: bltu a4, a5, .LBB92_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a5 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -398,9 +398,11 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 3 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll @@ -98,10 +98,12 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll @@ -8,9 +8,12 @@ ; CHECK-LABEL: vfptosi_nxv2i1_nxv2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i1.nxv2f16( %va, %m, i32 %evl) ret %v @@ -33,9 +36,12 @@ ; CHECK-LABEL: vfptosi_nxv2i1_nxv2f32: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call @llvm.vp.fptosi.nxv2i1.nxv2f32( %va, %m, i32 %evl) ret %v @@ -59,7 +65,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll @@ -318,10 +318,12 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll @@ -8,9 +8,12 @@ ; CHECK-LABEL: vfptoui_nxv2i1_nxv2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.xu.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i1.nxv2f16( %va, %m, i32 %evl) ret %v @@ -33,9 +36,12 @@ ; CHECK-LABEL: vfptoui_nxv2i1_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t +; CHECK-NEXT: vfcvt.rtz.xu.f.v v9, v8, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: ret %v = call @llvm.vp.fptoui.nxv2i1.nxv2f32( %va, %m, i32 %evl) ret %v @@ -59,7 +65,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vfcvt.rtz.xu.f.v v10, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu ; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll @@ -318,10 +318,12 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB25_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -98,9 +98,11 @@ ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a4, a1, 3 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: bltu a0, a3, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 @@ -113,8 +115,11 @@ ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv4r.v v12, v28 ; CHECK-NEXT: ret %v = call @llvm.vp.fptrunc.nxv16f64.nxv16f32( %a, %m, i32 %vl) ret %v @@ -128,17 +133,16 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a4, a1, 1 ; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: slli a4, a1, 1 +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: mv a5, a2 ; CHECK-NEXT: bltu a2, a4, .LBB8_2 ; CHECK-NEXT: # %bb.1: @@ -147,71 +151,69 @@ ; CHECK-NEXT: li a6, 0 ; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, mu ; CHECK-NEXT: sub a7, a5, a1 -; CHECK-NEXT: vslidedown.vx v0, v24, a3 +; CHECK-NEXT: vslidedown.vx v0, v1, a3 ; CHECK-NEXT: bltu a5, a7, .LBB8_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a6, a7 ; CHECK-NEXT: .LBB8_4: ; CHECK-NEXT: srli a7, a1, 2 +; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: slli t0, a1, 3 ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, mu -; CHECK-NEXT: vfncvt.f.f.w v12, v16, v0.t +; CHECK-NEXT: addi a6, sp, 16 +; CHECK-NEXT: vl8re8.v v24, (a6) # Unknown-size Folded Reload +; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t ; CHECK-NEXT: bltu a5, a1, .LBB8_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a5, a1 ; CHECK-NEXT: .LBB8_6: ; CHECK-NEXT: li a6, 0 ; CHECK-NEXT: vsetvli t1, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v1, v24, a7 +; CHECK-NEXT: vslidedown.vx v16, v1, a7 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 ; CHECK-NEXT: add a7, a0, t0 ; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, mu +; CHECK-NEXT: 
vmv1r.v v0, v1 +; CHECK-NEXT: vfncvt.f.f.w v4, v8, v0.t +; CHECK-NEXT: vsetvli a5, zero, e16, m8, ta, mu ; CHECK-NEXT: sub a4, a2, a4 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vl8re8.v v16, (a5) # Unknown-size Folded Reload -; CHECK-NEXT: vfncvt.f.f.w v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: bltu a2, a4, .LBB8_8 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: mv a6, a4 ; CHECK-NEXT: .LBB8_8: ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; CHECK-NEXT: vl8re64.v v16, (a7) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v24, (a7) ; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vmv4r.v v8, v4 ; CHECK-NEXT: sub a4, a6, a1 -; CHECK-NEXT: vslidedown.vx v0, v1, a3 +; CHECK-NEXT: vslidedown.vx v17, v16, a3 ; CHECK-NEXT: bltu a6, a4, .LBB8_10 ; CHECK-NEXT: # %bb.9: ; CHECK-NEXT: mv a2, a4 ; CHECK-NEXT: .LBB8_10: -; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv4r.v v12, v20 ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, mu -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v17 ; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t ; CHECK-NEXT: bltu a6, a1, .LBB8_12 ; CHECK-NEXT: # %bb.11: ; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB8_12: ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -260,10 +260,12 @@ ; RV32-NEXT: li a3, 0 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: srli a5, a2, 2 -; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, mu +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; RV32-NEXT: slli a2, a2, 1 ; RV32-NEXT: sub a4, a1, a2 -; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: vslidedown.vx v0, v12, a5 ; RV32-NEXT: bltu a1, a4, .LBB12_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a4 @@ -281,6 +283,8 @@ ; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_nxv32i8: @@ -288,7 +292,7 @@ ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a5, a3, 1 ; RV64-NEXT: sub a6, a1, a5 -; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: vmv1r.v v13, v0 ; RV64-NEXT: li a4, 0 ; RV64-NEXT: li a2, 0 ; RV64-NEXT: bltu a1, a6, .LBB12_2 @@ -302,11 +306,15 @@ ; RV64-NEXT: mv a7, a6 ; RV64-NEXT: .LBB12_4: ; RV64-NEXT: srli 
a6, a3, 2 -; RV64-NEXT: vsetvli t0, zero, e8, mf2, ta, mu -; RV64-NEXT: vslidedown.vx v13, v12, a6 +; RV64-NEXT: vsetvli t0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v12, v13, a6 ; RV64-NEXT: srli a6, a3, 3 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli t0, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v13, a6 +; RV64-NEXT: vslidedown.vx v0, v12, a6 ; RV64-NEXT: vsetvli t0, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, a7, e8, m1, ta, mu @@ -320,8 +328,10 @@ ; RV64-NEXT: # %bb.7: ; RV64-NEXT: mv a4, a5 ; RV64-NEXT: .LBB12_8: +; RV64-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v12, a6 +; RV64-NEXT: vslidedown.vx v0, v13, a6 ; RV64-NEXT: vsetvli a5, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v9 ; RV64-NEXT: vsetvli zero, a4, e8, m1, ta, mu @@ -333,8 +343,10 @@ ; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu -; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vmv1r.v v0, v13 ; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: bltu a2, a3, .LBB12_12 ; RV64-NEXT: # %bb.11: ; RV64-NEXT: mv a2, a3 @@ -342,7 +354,7 @@ ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu -; RV64-NEXT: vmv1r.v v0, v13 +; RV64-NEXT: vmv1r.v v0, v12 ; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs @@ -2341,9 +2353,11 @@ ; RV32-NEXT: li a2, 0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a4, a1, 3 +; RV32-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a3, a0, a1 -; RV32-NEXT: vslidedown.vx v0, v0, a4 +; RV32-NEXT: vslidedown.vx v0, v24, a4 ; RV32-NEXT: bltu a0, a3, .LBB102_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a2, a3 @@ -2366,9 +2380,11 @@ ; RV64-NEXT: li a2, 0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a4, a1, 3 +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a3, a0, a1 -; RV64-NEXT: vslidedown.vx v0, v0, a4 +; RV64-NEXT: vslidedown.vx v0, v24, a4 ; RV64-NEXT: bltu a0, a3, .LBB102_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a3 @@ -2394,9 +2410,11 @@ ; RV32-NEXT: li a3, 0 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: srli a5, a2, 3 +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 0 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a4, a1, a2 -; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: vslidedown.vx v0, v12, a5 ; RV32-NEXT: bltu a1, a4, .LBB103_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a3, a4 @@ -2421,9 +2439,11 @@ ; RV64-NEXT: li a3, 0 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a4, a1, a2 -; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: vslidedown.vx v0, v12, a5 ; RV64-NEXT: bltu a1, a4, .LBB103_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a3, a4 @@ -2468,19 +2488,22 @@ ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: srli a3, 
a2, 3 +; RV32-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a2, a1, a2 -; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: vslidedown.vx v16, v0, a3 ; RV32-NEXT: bltu a1, a2, .LBB104_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a2 ; RV32-NEXT: .LBB104_4: ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v24, v16 +; RV32-NEXT: vncvt.x.x.w v4, v24 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64: @@ -2491,9 +2514,11 @@ ; RV64-NEXT: vsext.vf4 v16, v10 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a4, a1, a2 -; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: vslidedown.vx v0, v12, a5 ; RV64-NEXT: bltu a1, a4, .LBB104_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a3, a4 @@ -2538,19 +2563,22 @@ ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: srli a3, a2, 3 +; RV32-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a2, a1, a2 -; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: vslidedown.vx v16, v0, a3 ; RV32-NEXT: bltu a1, a2, .LBB105_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a2 ; RV32-NEXT: .LBB105_4: ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsll.vi v24, v24, 3 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v24, v16 +; RV32-NEXT: vncvt.x.x.w v4, v24 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64: @@ -2561,9 +2589,11 @@ ; RV64-NEXT: vzext.vf4 v16, v10 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 0 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a4, a1, a2 -; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: vslidedown.vx v0, v12, a5 ; RV64-NEXT: bltu a1, a4, .LBB105_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a3, a4 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -457,9 +457,11 @@ ; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; CHECK-NEXT: sub a4, a1, a2 -; CHECK-NEXT: vslidedown.vx v0, v0, a5 +; CHECK-NEXT: vslidedown.vx v0, v8, a5 ; CHECK-NEXT: bltu a1, a4, .LBB37_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a4 @@ -510,6 +512,8 @@ ; CHECK-NEXT: .LBB38_4: ; CHECK-NEXT: li a7, 0 ; CHECK-NEXT: srli t0, a3, 3 +; CHECK-NEXT: vsetvli t1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vx v0, v8, t0 ; CHECK-NEXT: slli t0, a3, 3 @@ -517,13 +521,15 @@ ; CHECK-NEXT: vsetvli 
zero, a6, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v16, (t0), v0.t ; CHECK-NEXT: srli a6, a3, 2 +; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: sub t0, a2, a5 ; CHECK-NEXT: slli a5, a3, 4 ; CHECK-NEXT: bltu a2, t0, .LBB38_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a7, t0 ; CHECK-NEXT: .LBB38_6: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vx v0, v8, a6 ; CHECK-NEXT: add a2, a0, a5 ; CHECK-NEXT: bltu a7, a3, .LBB38_8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll @@ -2088,14 +2088,17 @@ ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t ; RV32-NEXT: srli a2, a0, 3 +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: vslidedown.vx v0, v0, a2 +; RV32-NEXT: vslidedown.vx v8, v0, a2 ; RV32-NEXT: bltu a1, a0, .LBB95_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a3, a0 ; RV32-NEXT: .LBB95_4: ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsoxei32.v v16, (zero), v28, v0.t ; RV32-NEXT: ret ; @@ -2122,14 +2125,17 @@ ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a0, a2, a1 -; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: vslidedown.vx v8, v0, a3 ; RV64-NEXT: bltu a2, a0, .LBB95_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a4, a0 ; RV64-NEXT: .LBB95_4: ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t @@ -2159,14 +2165,17 @@ ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: vslidedown.vx v8, v0, a3 ; RV32-NEXT: bltu a2, a1, .LBB96_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a1 ; RV32-NEXT: .LBB96_4: ; RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret ; @@ -2186,18 +2195,21 @@ ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: vslidedown.vx v8, v0, a3 ; RV64-NEXT: bltu a2, a1, .LBB96_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a4, a1 ; RV64-NEXT: .LBB96_4: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf4 v8, v6 -; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsext.vf4 v24, v6 +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, 
double* %base, %idxs call void @llvm.vp.scatter.nxv16f64.nxv16p0f64( %val, %ptrs, %m, i32 %evl) @@ -2238,24 +2250,27 @@ ; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: vslidedown.vx v8, v0, a3 ; RV32-NEXT: bltu a2, a1, .LBB97_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a1 ; RV32-NEXT: .LBB97_4: ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: vncvt.x.x.w v12, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v16, (a0), v12, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -2293,22 +2308,25 @@ ; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t ; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: vslidedown.vx v8, v0, a3 ; RV64-NEXT: bltu a2, a1, .LBB97_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a4, a1 ; RV64-NEXT: .LBB97_4: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 @@ -2354,24 +2372,27 @@ ; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: vslidedown.vx v8, v0, a3 ; RV32-NEXT: bltu a2, a1, .LBB98_4 ; RV32-NEXT: # %bb.3: ; RV32-NEXT: mv a4, a1 ; RV32-NEXT: .LBB98_4: ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 +; RV32-NEXT: vncvt.x.x.w v12, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v16, (a0), v12, v0.t ; RV32-NEXT: csrr a0, vlenb ; 
RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -2409,22 +2430,25 @@ ; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t ; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu ; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: vslidedown.vx v8, v0, a3 ; RV64-NEXT: bltu a2, a1, .LBB98_4 ; RV64-NEXT: # %bb.3: ; RV64-NEXT: mv a4, a1 ; RV64-NEXT: .LBB98_4: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll @@ -379,9 +379,11 @@ ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu ; CHECK-NEXT: vse64.v v8, (a0), v0.t ; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; CHECK-NEXT: sub a3, a1, a2 -; CHECK-NEXT: vslidedown.vx v0, v0, a5 +; CHECK-NEXT: vslidedown.vx v8, v0, a5 ; CHECK-NEXT: bltu a1, a3, .LBB30_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a4, a3 @@ -389,6 +391,7 @@ ; CHECK-NEXT: slli a1, a2, 3 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vse64.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.vp.store.nxv16f64.p0nxv16f64( %val, * %ptr, %m, i32 %evl) @@ -412,44 +415,48 @@ ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: mv a5, a2 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: bltu a2, a4, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: mv a6, a4 ; CHECK-NEXT: .LBB31_2: -; CHECK-NEXT: mv a7, a5 -; CHECK-NEXT: bltu a5, a3, .LBB31_4 +; CHECK-NEXT: mv a7, a6 +; CHECK-NEXT: bltu a6, a3, .LBB31_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a7, a3 ; CHECK-NEXT: .LBB31_4: -; CHECK-NEXT: li a6, 0 +; CHECK-NEXT: li a5, 0 ; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, mu -; CHECK-NEXT: sub a0, a5, a3 +; CHECK-NEXT: sub a0, a6, a3 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vse64.v v8, (a1), v0.t -; CHECK-NEXT: bltu a5, a0, .LBB31_6 +; CHECK-NEXT: bltu a6, a0, .LBB31_6 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: mv a6, a0 +; CHECK-NEXT: mv a5, a0 ; CHECK-NEXT: .LBB31_6: ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: srli a5, a3, 3 +; CHECK-NEXT: srli a6, a3, 3 +; CHECK-NEXT: vsetvli a7, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v0, v24, a5 -; CHECK-NEXT: slli a5, a3, 3 -; CHECK-NEXT: add a5, a1, a5 -; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, mu -; CHECK-NEXT: addi a6, sp, 16 -; CHECK-NEXT: vl8re8.v v8, (a6) # Unknown-size Folded Reload -; CHECK-NEXT: vse64.v v8, (a5), v0.t +; CHECK-NEXT: vslidedown.vx v0, v24, a6 +; 
CHECK-NEXT: slli a6, a3, 3 +; CHECK-NEXT: add a6, a1, a6 +; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vse64.v v8, (a6), v0.t ; CHECK-NEXT: srli a5, a3, 2 +; CHECK-NEXT: vsetvli a6, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: sub a6, a2, a4 ; CHECK-NEXT: slli a4, a3, 4 ; CHECK-NEXT: bltu a2, a6, .LBB31_8 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: mv a0, a6 ; CHECK-NEXT: .LBB31_8: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vslidedown.vx v0, v24, a5 ; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: bltu a0, a3, .LBB31_10 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -9,11 +9,15 @@ define half @vreduce_fadd_nxv1f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, %v) ret half %red @@ -22,11 +26,15 @@ define half @vreduce_ord_fadd_nxv1f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv1f16(half %s, %v) ret half %red @@ -37,11 +45,15 @@ define half @vreduce_fadd_nxv2f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, %v) ret half %red @@ -50,11 +62,15 @@ define half @vreduce_ord_fadd_nxv2f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; 
CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv2f16(half %s, %v) ret half %red @@ -65,11 +81,15 @@ define half @vreduce_fadd_nxv4f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, %v) ret half %red @@ -78,11 +98,15 @@ define half @vreduce_ord_fadd_nxv4f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv4f16(half %s, %v) ret half %red @@ -93,11 +117,15 @@ define float @vreduce_fadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) ret float %red @@ -106,11 +134,15 @@ define float @vreduce_ord_fadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) ret float %red @@ -119,12 +151,16 @@ define float @vreduce_fwadd_nxv1f32( %v, float %s) { ; 
CHECK-LABEL: vreduce_fwadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, %e) @@ -134,12 +170,16 @@ define float @vreduce_ord_fwadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, %e) @@ -151,11 +191,15 @@ define float @vreduce_fadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, %v) ret float %red @@ -164,11 +208,15 @@ define float @vreduce_ord_fadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, %v) ret float %red @@ -177,12 +225,16 @@ define float @vreduce_fwadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, 
m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, %e) @@ -192,12 +244,16 @@ define float @vreduce_ord_fwadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, %e) @@ -209,11 +265,15 @@ define float @vreduce_fadd_nxv4f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, %v) ret float %red @@ -222,11 +282,15 @@ define float @vreduce_ord_fadd_nxv4f32( %v, float %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, %v) ret float %red @@ -235,12 +299,16 @@ define float @vreduce_fwadd_nxv4f32( %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, %e) @@ -250,12 +318,16 @@ define float @vreduce_ord_fwadd_nxv4f32( %v, 
float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, %e) @@ -267,11 +339,15 @@ define double @vreduce_fadd_nxv1f64( %v, double %s) { ; CHECK-LABEL: vreduce_fadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, %v) ret double %red @@ -280,11 +356,15 @@ define double @vreduce_ord_fadd_nxv1f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, %v) ret double %red @@ -293,12 +373,16 @@ define double @vreduce_fwadd_nxv1f64( %v, double %s) { ; CHECK-LABEL: vreduce_fwadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, %e) @@ -308,12 +392,16 @@ define double @vreduce_ord_fwadd_nxv1f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, %e) @@ -325,11 +413,15 @@ define double @vreduce_fadd_nxv2f64( %v, double %s) { ; CHECK-LABEL: vreduce_fadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, %v) ret double %red @@ -338,11 +430,15 @@ define double @vreduce_ord_fadd_nxv2f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, %v) ret double %red @@ -351,12 +447,16 @@ define double @vreduce_fwadd_nxv2f64( %v, double %s) { ; CHECK-LABEL: vreduce_fwadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfwredusum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, %e) @@ -366,12 +466,16 @@ define double @vreduce_ord_fwadd_nxv2f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfwredosum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %e = fpext %v to %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, %e) @@ -383,11 +487,15 @@ define 
double @vreduce_fadd_nxv4f64( %v, double %s) { ; CHECK-LABEL: vreduce_fadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; CHECK-NEXT: vfredusum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, %v) ret double %red @@ -396,11 +504,15 @@ define double @vreduce_ord_fadd_nxv4f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, %v) ret double %red @@ -409,12 +521,16 @@ define double @vreduce_fwadd_nxv4f64( %v, double %s) { ; CHECK-LABEL: vreduce_fwadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfwredusum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, %e) @@ -424,12 +540,16 @@ define double @vreduce_ord_fwadd_nxv4f64( %v, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfwredosum.vs v8, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfwredosum.vs v11, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %e = fpext %v to %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, %e) @@ -445,9 +565,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, 
e16, mf4, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv1f16( %v) ret half %red @@ -460,9 +582,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan half @llvm.vector.reduce.fmin.nxv1f16( %v) ret half %red @@ -475,9 +599,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf half @llvm.vector.reduce.fmin.nxv1f16( %v) ret half %red @@ -492,9 +618,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv2f16( %v) ret half %red @@ -510,8 +638,10 @@ ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv4f16( %v) ret half %red @@ -528,9 +658,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv64f16( %v) ret half %red @@ -545,9 +677,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI36_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmin.nxv1f32( %v) ret float %red @@ -560,9 +694,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI37_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, 
mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan float @llvm.vector.reduce.fmin.nxv1f32( %v) ret float %red @@ -575,9 +711,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI38_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf float @llvm.vector.reduce.fmin.nxv1f32( %v) ret float %red @@ -592,9 +730,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI39_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmin.nxv2f32( %v) ret float %red @@ -609,9 +749,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI40_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v10, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfredmin.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmin.nxv4f32( %v) ret float %red @@ -628,9 +770,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI41_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmin.nxv32f32( %v) ret float %red @@ -645,9 +789,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI42_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmin.nxv1f64( %v) ret double %red @@ -660,9 +806,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI43_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli 
a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan double @llvm.vector.reduce.fmin.nxv1f64( %v) ret double %red @@ -675,9 +823,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI44_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmin.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf double @llvm.vector.reduce.fmin.nxv1f64( %v) ret double %red @@ -692,9 +842,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI45_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; CHECK-NEXT: vfredmin.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmin.nxv2f64( %v) ret double %red @@ -709,9 +861,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v12, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; CHECK-NEXT: vfredmin.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmin.nxv4f64( %v) ret double %red @@ -728,9 +882,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI47_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu +; CHECK-NEXT: vfredmin.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmin.nxv16f64( %v) ret double %red @@ -745,9 +901,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI48_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv1f16( %v) ret half %red @@ -760,9 +918,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI49_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, 
v10 ; CHECK-NEXT: ret %red = call nnan half @llvm.vector.reduce.fmax.nxv1f16( %v) ret half %red @@ -775,9 +935,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI50_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf half @llvm.vector.reduce.fmax.nxv1f16( %v) ret half %red @@ -792,9 +954,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI51_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv2f16( %v) ret half %red @@ -810,8 +974,10 @@ ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v9, (a0), zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv4f16( %v) ret half %red @@ -828,9 +994,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI53_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vlse16.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv64f16( %v) ret half %red @@ -845,9 +1013,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI54_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmax.nxv1f32( %v) ret float %red @@ -860,9 +1030,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI55_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan float @llvm.vector.reduce.fmax.nxv1f32( %v) ret float %red @@ -875,9 +1047,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI56_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, 
e32, mf2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf float @llvm.vector.reduce.fmax.nxv1f32( %v) ret float %red @@ -892,9 +1066,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI57_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmax.nxv2f32( %v) ret float %red @@ -909,9 +1085,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI58_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v10, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vfredmax.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmax.nxv4f32( %v) ret float %red @@ -928,9 +1106,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI59_0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vlse32.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call float @llvm.vector.reduce.fmax.nxv32f32( %v) ret float %red @@ -945,9 +1125,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI60_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmax.nxv1f64( %v) ret double %red @@ -960,9 +1142,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI61_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan double @llvm.vector.reduce.fmax.nxv1f64( %v) ret double %red @@ -975,9 +1159,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI62_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; 
CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; CHECK-NEXT: vfredmax.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call nnan ninf double @llvm.vector.reduce.fmax.nxv1f64( %v) ret double %red @@ -992,9 +1178,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI63_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; CHECK-NEXT: vfredmax.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmax.nxv2f64( %v) ret double %red @@ -1009,9 +1197,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI64_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v12, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; CHECK-NEXT: vfredmax.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmax.nxv4f64( %v) ret double %red @@ -1028,9 +1218,11 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI65_0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v17, 0 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, tu, mu +; CHECK-NEXT: vfredmax.vs v17, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v17 ; CHECK-NEXT: ret %red = call double @llvm.vector.reduce.fmax.nxv16f64( %v) ret double %red @@ -1039,11 +1231,15 @@ define float @vreduce_nsz_fadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_nsz_fadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) ret float %red @@ -1066,11 +1262,15 @@ ; CHECK-NEXT: vfmv.v.f v9, ft0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv3f16(half %s, %v) ret half %red @@ -1090,11 +1290,15 @@ ; CHECK-NEXT: vfmv.v.f v10, ft0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; 
CHECK-NEXT: vslideup.vx v9, v10, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, %v) ret half %red @@ -1118,11 +1322,15 @@ ; CHECK-NEXT: vslideup.vi v11, v12, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vx v11, v12, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, %v) ret half %red @@ -1133,15 +1341,18 @@ define half @vreduce_ord_fadd_nxv12f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv12f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: fmv.h.x ft0, zero ; CHECK-NEXT: fneg.h ft0, ft0 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v11, ft0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, %v) ret half %red @@ -1162,11 +1373,15 @@ ; CHECK-NEXT: vfmv.v.f v9, ft0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv3f16(half %s, %v) ret half %red @@ -1184,11 +1399,15 @@ ; CHECK-NEXT: vfmv.v.f v10, ft0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vx v9, v10, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu -; CHECK-NEXT: vfredusum.vs 
v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, tu, mu +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv6f16(half %s, %v) ret half %red @@ -1212,11 +1431,15 @@ ; CHECK-NEXT: vslideup.vi v11, v12, 0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vslideup.vx v11, v12, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, ft0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; CHECK-NEXT: vfredmin.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu +; CHECK-NEXT: vfredmin.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv10f16( %v) ret half %red @@ -1229,13 +1452,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI74_0) ; CHECK-NEXT: flh ft0, %lo(.LCPI74_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v12, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vfmv.s.f v12, ft0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v11, ft0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; CHECK-NEXT: vfredmax.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, tu, mu +; CHECK-NEXT: vfredmax.vs v13, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v13 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv12f16( %v) ret half %red diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -9,7 +9,9 @@ define half @vpreduce_fadd_nxv1f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -22,7 +24,9 @@ define half @vpreduce_ord_fadd_nxv1f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -37,7 +41,9 @@ define half @vpreduce_fadd_nxv2f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -50,7 +56,9 @@ define half 
@vpreduce_ord_fadd_nxv2f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -65,7 +73,9 @@ define half @vpreduce_fadd_nxv4f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -78,7 +88,9 @@ define half @vpreduce_ord_fadd_nxv4f16(half %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -95,7 +107,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: vfmv.s.f v25, fa0 ; CHECK-NEXT: mv a3, a0 @@ -109,7 +124,9 @@ ; CHECK-NEXT: vsetvli zero, a3, e16, m8, tu, mu ; CHECK-NEXT: vfredusum.vs v25, v8, v25, v0.t ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: sub a1, a0, a2 ; CHECK-NEXT: vfmv.s.f v8, ft0 ; CHECK-NEXT: bltu a0, a1, .LBB6_4 @@ -130,7 +147,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: srli a1, a2, 1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: vfmv.s.f v25, fa0 ; CHECK-NEXT: mv a3, a0 @@ -144,7 +164,9 @@ ; CHECK-NEXT: vsetvli zero, a3, e16, m8, tu, mu ; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t ; CHECK-NEXT: vfmv.f.s ft0, v25 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: sub a1, a0, a2 ; CHECK-NEXT: vfmv.s.f v8, ft0 ; CHECK-NEXT: bltu a0, a1, .LBB7_4 @@ -165,7 +187,9 @@ define float @vpreduce_fadd_nxv1f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -178,7 +202,9 @@ define float @vpreduce_ord_fadd_nxv1f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv1f32: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -193,7 +219,9 @@ define float @vpreduce_fadd_nxv2f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -206,7 +234,9 @@ define float @vpreduce_ord_fadd_nxv2f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -221,7 +251,9 @@ define float @vpreduce_fadd_nxv4f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu ; CHECK-NEXT: vfredusum.vs v10, v8, v10, v0.t @@ -234,7 +266,9 @@ define float @vpreduce_ord_fadd_nxv4f32(float %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu ; CHECK-NEXT: vfredosum.vs v10, v8, v10, v0.t @@ -249,7 +283,9 @@ define double @vpreduce_fadd_nxv1f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu ; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t @@ -262,7 +298,9 @@ define double @vpreduce_ord_fadd_nxv1f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu ; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t @@ -277,7 +315,9 @@ define double @vpreduce_fadd_nxv2f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredusum.vs v10, v8, v10, v0.t @@ -290,7 +330,9 @@ define double @vpreduce_ord_fadd_nxv2f64(double 
%s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu ; CHECK-NEXT: vfredosum.vs v10, v8, v10, v0.t @@ -305,7 +347,9 @@ define double @vpreduce_fadd_nxv3f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu ; CHECK-NEXT: vfredusum.vs v12, v8, v12, v0.t @@ -318,7 +362,9 @@ define double @vpreduce_ord_fadd_nxv3f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu ; CHECK-NEXT: vfredosum.vs v12, v8, v12, v0.t @@ -333,7 +379,9 @@ define double @vpreduce_fadd_nxv4f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu ; CHECK-NEXT: vfredusum.vs v12, v8, v12, v0.t @@ -346,7 +394,9 @@ define double @vpreduce_ord_fadd_nxv4f64(double %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu ; CHECK-NEXT: vfredosum.vs v12, v8, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll @@ -9,7 +9,9 @@ define signext i8 @vpreduce_add_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -25,7 +27,9 @@ ; CHECK-LABEL: vpreduce_umax_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -40,7 +44,9 @@ define signext i8 @vpreduce_smax_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu 
+; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -56,7 +62,9 @@ ; CHECK-LABEL: vpreduce_umin_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -71,7 +79,9 @@ define signext i8 @vpreduce_smin_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -86,7 +96,9 @@ define signext i8 @vpreduce_and_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -101,7 +113,9 @@ define signext i8 @vpreduce_or_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -116,7 +130,9 @@ define signext i8 @vpreduce_xor_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -131,7 +147,9 @@ define signext i8 @vpreduce_add_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -147,7 +165,9 @@ ; CHECK-LABEL: vpreduce_umax_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -162,7 +182,9 @@ define signext i8 @vpreduce_smax_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, 
m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -178,7 +200,9 @@ ; CHECK-LABEL: vpreduce_umin_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -193,7 +217,9 @@ define signext i8 @vpreduce_smin_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -208,7 +234,9 @@ define signext i8 @vpreduce_and_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -223,7 +251,9 @@ define signext i8 @vpreduce_or_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -238,7 +268,9 @@ define signext i8 @vpreduce_xor_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -253,7 +285,9 @@ define signext i8 @vpreduce_smax_nxv3i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv3i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -268,7 +302,9 @@ define signext i8 @vpreduce_add_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -284,7 +320,9 @@ ; CHECK-LABEL: vpreduce_umax_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -299,7 +337,9 @@ define signext i8 @vpreduce_smax_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -315,7 +355,9 @@ ; CHECK-LABEL: vpreduce_umin_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -330,7 +372,9 @@ define signext i8 @vpreduce_smin_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -345,7 +389,9 @@ define signext i8 @vpreduce_and_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -360,7 +406,9 @@ define signext i8 @vpreduce_or_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -375,7 +423,9 @@ define signext i8 @vpreduce_xor_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -390,7 +440,9 @@ define signext i16 @vpreduce_add_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -407,7 +459,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV32-NEXT: vredmaxu.vs v9, 
v8, v9, v0.t @@ -418,7 +472,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -433,7 +489,9 @@ define signext i16 @vpreduce_smax_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -450,7 +508,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -461,7 +521,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -476,7 +538,9 @@ define signext i16 @vpreduce_smin_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -491,7 +555,9 @@ define signext i16 @vpreduce_and_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -506,7 +572,9 @@ define signext i16 @vpreduce_or_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -521,7 +589,9 @@ define signext i16 @vpreduce_xor_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -536,7 +606,9 @@ define signext i16 @vpreduce_add_nxv2i16(i16 signext %s, %v, %m, i32 
zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -553,7 +625,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -564,7 +638,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -579,7 +655,9 @@ define signext i16 @vpreduce_smax_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -596,7 +674,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -607,7 +687,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -622,7 +704,9 @@ define signext i16 @vpreduce_smin_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -637,7 +721,9 @@ define signext i16 @vpreduce_and_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -652,7 +738,9 @@ define signext i16 @vpreduce_or_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -667,7 +755,9 @@ define signext i16 @vpreduce_xor_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -682,7 +772,9 @@ define signext i16 @vpreduce_add_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -699,7 +791,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -710,7 +804,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -725,7 +821,9 @@ define signext i16 @vpreduce_smax_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -742,7 +840,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: slli a0, a0, 16 ; RV32-NEXT: srli a0, a0, 16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -753,7 +853,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -768,7 +870,9 @@ define signext i16 @vpreduce_smin_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: 
vredmin.vs v9, v8, v9, v0.t @@ -783,7 +887,9 @@ define signext i16 @vpreduce_and_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -798,7 +904,9 @@ define signext i16 @vpreduce_or_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -813,7 +921,9 @@ define signext i16 @vpreduce_xor_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -828,7 +938,9 @@ define signext i32 @vpreduce_add_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -843,7 +955,9 @@ define signext i32 @vpreduce_umax_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umax_nxv1i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -854,7 +968,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -869,7 +985,9 @@ define signext i32 @vpreduce_smax_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -884,7 +1002,9 @@ define signext i32 @vpreduce_umin_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umin_nxv1i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu 
; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -895,7 +1015,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -910,7 +1032,9 @@ define signext i32 @vpreduce_smin_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -925,7 +1049,9 @@ define signext i32 @vpreduce_and_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -940,7 +1066,9 @@ define signext i32 @vpreduce_or_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -955,7 +1083,9 @@ define signext i32 @vpreduce_xor_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -970,7 +1100,9 @@ define signext i32 @vpreduce_add_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -985,7 +1117,9 @@ define signext i32 @vpreduce_umax_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umax_nxv2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV32-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -996,7 +1130,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ 
-1011,7 +1147,9 @@ define signext i32 @vpreduce_smax_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -1026,7 +1164,9 @@ define signext i32 @vpreduce_umin_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umin_nxv2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV32-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -1037,7 +1177,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -1052,7 +1194,9 @@ define signext i32 @vpreduce_smin_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -1067,7 +1211,9 @@ define signext i32 @vpreduce_and_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredand.vs v9, v8, v9, v0.t @@ -1082,7 +1228,9 @@ define signext i32 @vpreduce_or_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredor.vs v9, v8, v9, v0.t @@ -1097,7 +1245,9 @@ define signext i32 @vpreduce_xor_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; CHECK-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -1112,7 +1262,9 @@ define signext i32 @vpreduce_add_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_add_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: 
vredsum.vs v10, v8, v10, v0.t @@ -1127,7 +1279,9 @@ define signext i32 @vpreduce_umax_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umax_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV32-NEXT: vredmaxu.vs v10, v8, v10, v0.t @@ -1138,7 +1292,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV64-NEXT: vredmaxu.vs v10, v8, v10, v0.t @@ -1155,7 +1311,10 @@ ; RV32: # %bb.0: ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: srli a2, a3, 2 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vmv.v.i v25, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: slli a3, a3, 1 ; RV32-NEXT: vmv.s.x v25, a0 ; RV32-NEXT: mv a0, a1 @@ -1169,7 +1328,9 @@ ; RV32-NEXT: vsetvli zero, a0, e32, m8, tu, mu ; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t ; RV32-NEXT: vmv.x.s a2, v25 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: sub a0, a1, a3 ; RV32-NEXT: vmv.s.x v8, a2 ; RV32-NEXT: bltu a1, a0, .LBB67_4 @@ -1186,6 +1347,8 @@ ; RV64: # %bb.0: ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: srli a2, a3, 2 +; RV64-NEXT: vsetvli a4, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v24, 0 ; RV64-NEXT: slli a4, a0, 32 ; RV64-NEXT: slli a0, a3, 1 ; RV64-NEXT: srli a3, a4, 32 @@ -1195,14 +1358,18 @@ ; RV64-NEXT: mv a4, a0 ; RV64-NEXT: .LBB67_2: ; RV64-NEXT: li a5, 0 -; RV64-NEXT: vsetvli a6, zero, e8, mf2, ta, mu +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; RV64-NEXT: vslidedown.vx v24, v0, a2 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v25, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v25, a3 ; RV64-NEXT: vsetvli zero, a4, e32, m8, tu, mu ; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t ; RV64-NEXT: vmv.x.s a2, v25 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: sub a0, a1, a0 ; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: bltu a1, a0, .LBB67_4 @@ -1223,7 +1390,9 @@ define signext i32 @vpreduce_smax_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smax_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: vredmax.vs v10, v8, v10, v0.t @@ -1238,7 +1407,9 @@ define signext i32 @vpreduce_umin_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; RV32-LABEL: vpreduce_umin_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x 
v10, a0 ; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV32-NEXT: vredminu.vs v10, v8, v10, v0.t @@ -1249,7 +1420,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV64-NEXT: vredminu.vs v10, v8, v10, v0.t @@ -1264,7 +1437,9 @@ define signext i32 @vpreduce_smin_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_smin_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: vredmin.vs v10, v8, v10, v0.t @@ -1279,7 +1454,9 @@ define signext i32 @vpreduce_and_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_and_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: vredand.vs v10, v8, v10, v0.t @@ -1294,7 +1471,9 @@ define signext i32 @vpreduce_or_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_or_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: vredor.vs v10, v8, v10, v0.t @@ -1309,7 +1488,9 @@ define signext i32 @vpreduce_xor_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; CHECK-NEXT: vredxor.vs v10, v8, v10, v0.t @@ -1343,7 +1524,9 @@ ; ; RV64-LABEL: vpreduce_add_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredsum.vs v9, v8, v9, v0.t @@ -1376,7 +1559,9 @@ ; ; RV64-LABEL: vpwreduce_add_nxv1i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t @@ -1411,7 +1596,9 @@ ; ; RV64-LABEL: vpwreduce_uadd_nxv1i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu ; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t @@ -1447,7 +1634,9 @@ ; ; RV64-LABEL: vpreduce_umax_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, 
ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmaxu.vs v9, v8, v9, v0.t @@ -1481,7 +1670,9 @@ ; ; RV64-LABEL: vpreduce_smax_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmax.vs v9, v8, v9, v0.t @@ -1515,7 +1706,9 @@ ; ; RV64-LABEL: vpreduce_umin_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredminu.vs v9, v8, v9, v0.t @@ -1549,7 +1742,9 @@ ; ; RV64-LABEL: vpreduce_smin_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredmin.vs v9, v8, v9, v0.t @@ -1583,7 +1778,9 @@ ; ; RV64-LABEL: vpreduce_and_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredand.vs v9, v8, v9, v0.t @@ -1617,7 +1814,9 @@ ; ; RV64-LABEL: vpreduce_or_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredor.vs v9, v8, v9, v0.t @@ -1651,7 +1850,9 @@ ; ; RV64-LABEL: vpreduce_xor_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu ; RV64-NEXT: vredxor.vs v9, v8, v9, v0.t @@ -1685,7 +1886,9 @@ ; ; RV64-LABEL: vpreduce_add_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredsum.vs v10, v8, v10, v0.t @@ -1718,7 +1921,9 @@ ; ; RV64-LABEL: vwpreduce_add_nxv2i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t @@ -1753,7 +1958,9 @@ ; ; RV64-LABEL: vwpreduce_uadd_nxv2i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu ; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t @@ -1789,7 
+1996,9 @@ ; ; RV64-LABEL: vpreduce_umax_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmaxu.vs v10, v8, v10, v0.t @@ -1823,7 +2032,9 @@ ; ; RV64-LABEL: vpreduce_smax_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmax.vs v10, v8, v10, v0.t @@ -1857,7 +2068,9 @@ ; ; RV64-LABEL: vpreduce_umin_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredminu.vs v10, v8, v10, v0.t @@ -1891,7 +2104,9 @@ ; ; RV64-LABEL: vpreduce_smin_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredmin.vs v10, v8, v10, v0.t @@ -1925,7 +2140,9 @@ ; ; RV64-LABEL: vpreduce_and_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredand.vs v10, v8, v10, v0.t @@ -1959,7 +2176,9 @@ ; ; RV64-LABEL: vpreduce_or_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredor.vs v10, v8, v10, v0.t @@ -1993,7 +2212,9 @@ ; ; RV64-LABEL: vpreduce_xor_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu ; RV64-NEXT: vredxor.vs v10, v8, v10, v0.t @@ -2027,7 +2248,9 @@ ; ; RV64-LABEL: vpreduce_add_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredsum.vs v12, v8, v12, v0.t @@ -2060,7 +2283,9 @@ ; ; RV64-LABEL: vpwreduce_add_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV64-NEXT: vwredsum.vs v10, v8, v10, v0.t @@ -2095,7 +2320,9 @@ ; ; RV64-LABEL: vpwreduce_uadd_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, 
e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu ; RV64-NEXT: vwredsumu.vs v10, v8, v10, v0.t @@ -2131,7 +2358,9 @@ ; ; RV64-LABEL: vpreduce_umax_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredmaxu.vs v12, v8, v12, v0.t @@ -2165,7 +2394,9 @@ ; ; RV64-LABEL: vpreduce_smax_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredmax.vs v12, v8, v12, v0.t @@ -2199,7 +2430,9 @@ ; ; RV64-LABEL: vpreduce_umin_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredminu.vs v12, v8, v12, v0.t @@ -2233,7 +2466,9 @@ ; ; RV64-LABEL: vpreduce_smin_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredmin.vs v12, v8, v12, v0.t @@ -2267,7 +2502,9 @@ ; ; RV64-LABEL: vpreduce_and_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredand.vs v12, v8, v12, v0.t @@ -2301,7 +2538,9 @@ ; ; RV64-LABEL: vpreduce_or_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredor.vs v12, v8, v12, v0.t @@ -2335,7 +2574,9 @@ ; ; RV64-LABEL: vpreduce_xor_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 ; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu ; RV64-NEXT: vredxor.vs v12, v8, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll @@ -9,11 +9,15 @@ define signext i8 @vreduce_add_nxv1i8( %v) { ; CHECK-LABEL: vreduce_add_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; 
CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.add.nxv1i8( %v) ret i8 %red @@ -24,11 +28,15 @@ define signext i8 @vreduce_umax_nxv1i8( %v) { ; CHECK-LABEL: vreduce_umax_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umax.nxv1i8( %v) ret i8 %red @@ -40,11 +48,15 @@ ; CHECK-LABEL: vreduce_smax_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smax.nxv1i8( %v) ret i8 %red @@ -57,9 +69,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umin.nxv1i8( %v) ret i8 %red @@ -71,11 +85,15 @@ ; CHECK-LABEL: vreduce_smin_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smin.nxv1i8( %v) ret i8 %red @@ -88,9 +106,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.and.nxv1i8( %v) ret i8 %red @@ -101,11 +121,15 @@ define signext i8 @vreduce_or_nxv1i8( %v) { ; CHECK-LABEL: vreduce_or_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: 
vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.or.nxv1i8( %v) ret i8 %red @@ -116,11 +140,15 @@ define signext i8 @vreduce_xor_nxv1i8( %v) { ; CHECK-LABEL: vreduce_xor_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.xor.nxv1i8( %v) ret i8 %red @@ -131,11 +159,15 @@ define signext i8 @vreduce_add_nxv2i8( %v) { ; CHECK-LABEL: vreduce_add_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.add.nxv2i8( %v) ret i8 %red @@ -146,11 +178,15 @@ define signext i8 @vreduce_umax_nxv2i8( %v) { ; CHECK-LABEL: vreduce_umax_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umax.nxv2i8( %v) ret i8 %red @@ -162,11 +198,15 @@ ; CHECK-LABEL: vreduce_smax_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smax.nxv2i8( %v) ret i8 %red @@ -179,9 +219,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umin.nxv2i8( %v) ret i8 %red @@ -193,11 +235,15 @@ ; CHECK-LABEL: vreduce_smin_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smin.nxv2i8( %v) ret i8 %red @@ -210,9 +256,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.and.nxv2i8( %v) ret i8 %red @@ -223,11 +271,15 @@ define signext i8 @vreduce_or_nxv2i8( %v) { ; CHECK-LABEL: vreduce_or_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.or.nxv2i8( %v) ret i8 %red @@ -238,11 +290,15 @@ define signext i8 @vreduce_xor_nxv2i8( %v) { ; CHECK-LABEL: vreduce_xor_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.xor.nxv2i8( %v) ret i8 %red @@ -253,11 +309,15 @@ define signext i8 @vreduce_add_nxv4i8( %v) { ; CHECK-LABEL: vreduce_add_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 
@llvm.vector.reduce.add.nxv4i8( %v) ret i8 %red @@ -268,11 +328,15 @@ define signext i8 @vreduce_umax_nxv4i8( %v) { ; CHECK-LABEL: vreduce_umax_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umax.nxv4i8( %v) ret i8 %red @@ -284,11 +348,15 @@ ; CHECK-LABEL: vreduce_smax_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, -128 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smax.nxv4i8( %v) ret i8 %red @@ -301,9 +369,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.umin.nxv4i8( %v) ret i8 %red @@ -315,11 +385,15 @@ ; CHECK-LABEL: vreduce_smin_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredmin.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.smin.nxv4i8( %v) ret i8 %red @@ -332,9 +406,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.and.nxv4i8( %v) ret i8 %red @@ -345,11 +421,15 @@ define signext i8 @vreduce_or_nxv4i8( %v) { ; CHECK-LABEL: vreduce_or_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, 
zero, e8, mf2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.or.nxv4i8( %v) ret i8 %red @@ -360,11 +440,15 @@ define signext i8 @vreduce_xor_nxv4i8( %v) { ; CHECK-LABEL: vreduce_xor_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i8 @llvm.vector.reduce.xor.nxv4i8( %v) ret i8 %red @@ -375,11 +459,15 @@ define signext i16 @vreduce_add_nxv1i16( %v) { ; CHECK-LABEL: vreduce_add_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.add.nxv1i16( %v) ret i16 %red @@ -388,12 +476,16 @@ define signext i16 @vwreduce_add_nxv1i8( %v) { ; CHECK-LABEL: vwreduce_add_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) @@ -403,12 +495,16 @@ define signext i16 @vwreduce_uadd_nxv1i8( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) @@ -420,11 +516,15 @@ define signext i16 @vreduce_umax_nxv1i16( %v) { ; CHECK-LABEL: vreduce_umax_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, 
m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umax.nxv1i16( %v) ret i16 %red @@ -436,11 +536,15 @@ ; CHECK-LABEL: vreduce_smax_nxv1i16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.smax.nxv1i16( %v) ret i16 %red @@ -453,9 +557,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umin.nxv1i16( %v) ret i16 %red @@ -468,22 +574,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv1i16: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i16 @llvm.vector.reduce.smin.nxv1i16( %v) ret i16 %red @@ -496,9 +610,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s 
a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.and.nxv1i16( %v) ret i16 %red @@ -509,11 +625,15 @@ define signext i16 @vreduce_or_nxv1i16( %v) { ; CHECK-LABEL: vreduce_or_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.or.nxv1i16( %v) ret i16 %red @@ -524,11 +644,15 @@ define signext i16 @vreduce_xor_nxv1i16( %v) { ; CHECK-LABEL: vreduce_xor_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.xor.nxv1i16( %v) ret i16 %red @@ -539,11 +663,15 @@ define signext i16 @vreduce_add_nxv2i16( %v) { ; CHECK-LABEL: vreduce_add_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.add.nxv2i16( %v) ret i16 %red @@ -552,12 +680,16 @@ define signext i16 @vwreduce_add_nxv2i8( %v) { ; CHECK-LABEL: vwreduce_add_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) @@ -567,12 +699,16 @@ define signext i16 @vwreduce_uadd_nxv2i8( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) @@ -584,11 +720,15 @@ define signext i16 @vreduce_umax_nxv2i16( %v) { ; CHECK-LABEL: vreduce_umax_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umax.nxv2i16( %v) ret i16 %red @@ -600,11 +740,15 @@ ; CHECK-LABEL: vreduce_smax_nxv2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.smax.nxv2i16( %v) ret i16 %red @@ -617,9 +761,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umin.nxv2i16( %v) ret i16 %red @@ -632,22 +778,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv2i16: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i16 
@llvm.vector.reduce.smin.nxv2i16( %v) ret i16 %red @@ -660,9 +814,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.and.nxv2i16( %v) ret i16 %red @@ -673,11 +829,15 @@ define signext i16 @vreduce_or_nxv2i16( %v) { ; CHECK-LABEL: vreduce_or_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.or.nxv2i16( %v) ret i16 %red @@ -688,11 +848,15 @@ define signext i16 @vreduce_xor_nxv2i16( %v) { ; CHECK-LABEL: vreduce_xor_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.xor.nxv2i16( %v) ret i16 %red @@ -703,11 +867,15 @@ define signext i16 @vreduce_add_nxv4i16( %v) { ; CHECK-LABEL: vreduce_add_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.add.nxv4i16( %v) ret i16 %red @@ -716,12 +884,16 @@ define signext i16 @vwreduce_add_nxv4i8( %v) { ; CHECK-LABEL: vwreduce_add_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) @@ -731,12 +903,16 @@ 
define signext i16 @vwreduce_uadd_nxv4i8( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) @@ -748,11 +924,15 @@ define signext i16 @vreduce_umax_nxv4i16( %v) { ; CHECK-LABEL: vreduce_umax_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umax.nxv4i16( %v) ret i16 %red @@ -764,11 +944,15 @@ ; CHECK-LABEL: vreduce_smax_nxv4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.smax.nxv4i16( %v) ret i16 %red @@ -782,8 +966,10 @@ ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.umin.nxv4i16( %v) ret i16 %red @@ -796,22 +982,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: lui a0, 8 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv4i16: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 8 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, 
mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i16 @llvm.vector.reduce.smin.nxv4i16( %v) ret i16 %red @@ -825,8 +1019,10 @@ ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.and.nxv4i16( %v) ret i16 %red @@ -837,11 +1033,15 @@ define signext i16 @vreduce_or_nxv4i16( %v) { ; CHECK-LABEL: vreduce_or_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.or.nxv4i16( %v) ret i16 %red @@ -852,11 +1052,15 @@ define signext i16 @vreduce_xor_nxv4i16( %v) { ; CHECK-LABEL: vreduce_xor_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i16 @llvm.vector.reduce.xor.nxv4i16( %v) ret i16 %red @@ -867,11 +1071,15 @@ define signext i32 @vreduce_add_nxv1i32( %v) { ; CHECK-LABEL: vreduce_add_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.add.nxv1i32( %v) ret i32 %red @@ -880,12 +1088,16 @@ define signext i32 @vwreduce_add_nxv1i16( %v) { ; CHECK-LABEL: vwreduce_add_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli 
a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) @@ -895,12 +1107,16 @@ define signext i32 @vwreduce_uadd_nxv1i16( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = zext %v to %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) @@ -912,11 +1128,15 @@ define signext i32 @vreduce_umax_nxv1i32( %v) { ; CHECK-LABEL: vreduce_umax_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umax.nxv1i32( %v) ret i32 %red @@ -928,11 +1148,15 @@ ; CHECK-LABEL: vreduce_smax_nxv1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.smax.nxv1i32( %v) ret i32 %red @@ -945,9 +1169,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umin.nxv1i32( %v) ret i32 %red @@ -960,22 +1186,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, mf2, tu, mu 
+; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv1i32: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i32 @llvm.vector.reduce.smin.nxv1i32( %v) ret i32 %red @@ -988,9 +1222,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.and.nxv1i32( %v) ret i32 %red @@ -1001,11 +1237,15 @@ define signext i32 @vreduce_or_nxv1i32( %v) { ; CHECK-LABEL: vreduce_or_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.or.nxv1i32( %v) ret i32 %red @@ -1016,11 +1256,15 @@ define signext i32 @vreduce_xor_nxv1i32( %v) { ; CHECK-LABEL: vreduce_xor_nxv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.xor.nxv1i32( %v) ret i32 %red @@ -1031,11 +1275,15 @@ define signext i32 @vreduce_add_nxv2i32( %v) { ; CHECK-LABEL: vreduce_add_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredsum.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.add.nxv2i32( %v) ret i32 %red @@ -1044,12 +1292,16 @@ define signext i32 
@vwreduce_add_nxv2i16( %v) { ; CHECK-LABEL: vwreduce_add_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) @@ -1059,12 +1311,16 @@ define signext i32 @vwreduce_uadd_nxv2i16( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = zext %v to %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) @@ -1076,11 +1332,15 @@ define signext i32 @vreduce_umax_nxv2i32( %v) { ; CHECK-LABEL: vreduce_umax_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredmaxu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umax.nxv2i32( %v) ret i32 %red @@ -1092,11 +1352,15 @@ ; CHECK-LABEL: vreduce_smax_nxv2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredmax.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.smax.nxv2i32( %v) ret i32 %red @@ -1109,9 +1373,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredminu.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umin.nxv2i32( %v) ret i32 %red @@ -1124,22 +1390,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: 
lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv2i32: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i32 @llvm.vector.reduce.smin.nxv2i32( %v) ret i32 %red @@ -1152,9 +1426,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredand.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.and.nxv2i32( %v) ret i32 %red @@ -1165,11 +1441,15 @@ define signext i32 @vreduce_or_nxv2i32( %v) { ; CHECK-LABEL: vreduce_or_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.or.nxv2i32( %v) ret i32 %red @@ -1180,11 +1460,15 @@ define signext i32 @vreduce_xor_nxv2i32( %v) { ; CHECK-LABEL: vreduce_xor_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; CHECK-NEXT: vredxor.vs v10, v8, v9 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.xor.nxv2i32( %v) ret i32 %red @@ -1195,11 +1479,15 @@ define signext i32 @vreduce_add_nxv4i32( %v) { ; CHECK-LABEL: vreduce_add_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: 
vmv.s.x v10, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredsum.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredsum.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.add.nxv4i32( %v) ret i32 %red @@ -1208,12 +1496,16 @@ define signext i32 @vwreduce_add_nxv4i16( %v) { ; CHECK-LABEL: vwreduce_add_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vwredsum.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = sext %v to %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) @@ -1223,12 +1515,16 @@ define signext i32 @vwreduce_uadd_nxv4i16( %v) { ; CHECK-LABEL: vwreduce_uadd_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; CHECK-NEXT: vwredsumu.vs v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v10 ; CHECK-NEXT: ret %e = zext %v to %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) @@ -1240,11 +1536,15 @@ define signext i32 @vreduce_umax_nxv4i32( %v) { ; CHECK-LABEL: vreduce_umax_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredmaxu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umax.nxv4i32( %v) ret i32 %red @@ -1256,11 +1556,15 @@ ; CHECK-LABEL: vreduce_smax_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredmax.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredmax.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.smax.nxv4i32( %v) ret i32 %red @@ -1273,9 +1577,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredminu.vs v8, v8, 
v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredminu.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.umin.nxv4i32( %v) ret i32 %red @@ -1288,22 +1594,30 @@ ; RV32: # %bb.0: ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV32-NEXT: vredmin.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_smin_nxv4i32: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64-NEXT: vredmin.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i32 @llvm.vector.reduce.smin.nxv4i32( %v) ret i32 %red @@ -1316,9 +1630,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredand.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredand.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.and.nxv4i32( %v) ret i32 %red @@ -1329,11 +1645,15 @@ define signext i32 @vreduce_or_nxv4i32( %v) { ; CHECK-LABEL: vreduce_or_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 ; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.or.nxv4i32( %v) ret i32 %red @@ -1344,11 +1664,15 @@ define signext i32 @vreduce_xor_nxv4i32( %v) { ; CHECK-LABEL: vreduce_xor_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, mu ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; CHECK-NEXT: vredxor.vs v8, v8, v10 -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; CHECK-NEXT: vredxor.vs v11, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v11 
; CHECK-NEXT: ret %red = call i32 @llvm.vector.reduce.xor.nxv4i32( %v) ret i32 %red @@ -1359,24 +1683,32 @@ define i64 @vreduce_add_nxv1i64( %v) { ; RV32-LABEL: vreduce_add_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredsum.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredsum.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_add_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredsum.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredsum.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.add.nxv1i64( %v) ret i64 %red @@ -1385,26 +1717,34 @@ define i64 @vwreduce_add_nxv1i32( %v) { ; RV32-LABEL: vwreduce_add_nxv1i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v9 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_nxv1i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; RV64-NEXT: vwredsum.vs v10, v8, v9 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %e = sext %v to %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) @@ -1414,26 +1754,34 @@ define i64 @vwreduce_uadd_nxv1i32( %v) { ; RV32-LABEL: vwreduce_uadd_nxv1i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; 
RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v9 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_uadd_nxv1i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, mf2, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v9 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %e = zext %v to %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) @@ -1445,24 +1793,32 @@ define i64 @vreduce_umax_nxv1i64( %v) { ; RV32-LABEL: vreduce_umax_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredmaxu.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_umax_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredmaxu.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umax.nxv1i64( %v) ret i64 %red @@ -1481,12 +1837,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredmax.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1495,11 +1853,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, 
mu -; RV64-NEXT: vredmax.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredmax.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smax.nxv1i64( %v) ret i64 %red @@ -1512,12 +1874,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredminu.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1525,9 +1889,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredminu.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umin.nxv1i64( %v) ret i64 %red @@ -1548,12 +1914,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredmin.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1562,11 +1930,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredmin.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smin.nxv1i64( %v) ret i64 %red @@ -1579,12 +1951,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredand.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1592,9 +1966,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v9, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, 
mu -; RV64-NEXT: vredand.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredand.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.and.nxv1i64( %v) ret i64 %red @@ -1605,24 +1981,32 @@ define i64 @vreduce_or_nxv1i64( %v) { ; RV32-LABEL: vreduce_or_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredor.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_or_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredor.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.or.nxv1i64( %v) ret i64 %red @@ -1633,24 +2017,32 @@ define i64 @vreduce_xor_nxv1i64( %v) { ; RV32-LABEL: vreduce_xor_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV32-NEXT: vredxor.vs v10, v8, v9 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_xor_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m1, tu, mu +; RV64-NEXT: vredxor.vs v10, v8, v9 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.xor.nxv1i64( %v) ret i64 %red @@ -1661,24 +2053,32 @@ define i64 @vreduce_add_nxv2i64( %v) { ; RV32-LABEL: vreduce_add_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; 
RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredsum.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredsum.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_add_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredsum.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredsum.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.add.nxv2i64( %v) ret i64 %red @@ -1687,26 +2087,34 @@ define i64 @vwreduce_add_nxv2i32( %v) { ; RV32-LABEL: vwreduce_add_nxv2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v9 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_nxv2i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV64-NEXT: vwredsum.vs v10, v8, v9 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %e = sext %v to %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) @@ -1716,26 +2124,34 @@ define i64 @vwreduce_uadd_nxv2i32( %v) { ; RV32-LABEL: vwreduce_uadd_nxv2i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v9 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v10 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; 
RV64-LABEL: vwreduce_uadd_nxv2i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli a0, zero, e32, m1, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v9 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v10 ; RV64-NEXT: ret %e = zext %v to %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) @@ -1747,24 +2163,32 @@ define i64 @vreduce_umax_nxv2i64( %v) { ; RV32-LABEL: vreduce_umax_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredmaxu.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredmaxu.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_umax_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredmaxu.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredmaxu.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umax.nxv2i64( %v) ret i64 %red @@ -1783,12 +2207,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredmax.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1797,11 +2223,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredmax.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredmax.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smax.nxv2i64( %v) ret i64 %red @@ -1814,12 +2244,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: 
vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredminu.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1827,9 +2259,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredminu.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umin.nxv2i64( %v) ret i64 %red @@ -1850,12 +2284,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredmin.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1864,11 +2300,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredmin.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smin.nxv2i64( %v) ret i64 %red @@ -1881,12 +2321,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredand.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1894,9 +2336,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredand.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.and.nxv2i64( %v) ret i64 %red @@ -1907,24 +2351,32 @@ define i64 
@vreduce_or_nxv2i64( %v) { ; RV32-LABEL: vreduce_or_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredor.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredor.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_or_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredor.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredor.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.or.nxv2i64( %v) ret i64 %red @@ -1935,24 +2387,32 @@ define i64 @vreduce_xor_nxv2i64( %v) { ; RV32-LABEL: vreduce_xor_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vredxor.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV32-NEXT: vredxor.vs v11, v8, v10 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_xor_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vredxor.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli a0, zero, e64, m2, tu, mu +; RV64-NEXT: vredxor.vs v11, v8, v10 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.xor.nxv2i64( %v) ret i64 %red @@ -1963,24 +2423,32 @@ define i64 @vreduce_add_nxv4i64( %v) { ; RV32-LABEL: vreduce_add_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredsum.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredsum.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, 
e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_add_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredsum.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredsum.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.add.nxv4i64( %v) ret i64 %red @@ -1989,26 +2457,34 @@ define i64 @vwreduce_add_nxv4i32( %v) { ; RV32-LABEL: vwreduce_add_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV32-NEXT: vwredsum.vs v8, v8, v10 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV32-NEXT: vwredsum.vs v11, v8, v10 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vwredsum.vs v8, v8, v10 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64-NEXT: vwredsum.vs v11, v8, v10 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %e = sext %v to %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) @@ -2018,26 +2494,34 @@ define i64 @vwreduce_uadd_nxv4i32( %v) { ; RV32-LABEL: vwreduce_uadd_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV32-NEXT: vwredsumu.vs v8, v8, v10 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV32-NEXT: vwredsumu.vs v11, v8, v10 ; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a0, v11 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v11, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_uadd_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vwredsumu.vs v8, v8, v10 
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64-NEXT: vwredsumu.vs v11, v8, v10 ; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv.x.s a0, v11 ; RV64-NEXT: ret %e = zext %v to %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) @@ -2049,24 +2533,32 @@ define i64 @vreduce_umax_nxv4i64( %v) { ; RV32-LABEL: vreduce_umax_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredmaxu.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredmaxu.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_umax_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredmaxu.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredmaxu.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umax.nxv4i64( %v) ret i64 %red @@ -2085,12 +2577,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredmax.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredmax.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -2099,11 +2593,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredmax.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredmax.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smax.nxv4i64( %v) ret i64 %red @@ -2116,12 +2614,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredminu.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: 
vredminu.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2129,9 +2629,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v12, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredminu.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredminu.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.umin.nxv4i64( %v) ret i64 %red @@ -2152,12 +2654,14 @@ ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredmin.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredmin.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -2166,11 +2670,15 @@ ; RV64: # %bb.0: ; RV64-NEXT: li a0, -1 ; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredmin.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredmin.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.smin.nxv4i64( %v) ret i64 %red @@ -2183,12 +2691,14 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredand.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredand.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -2196,9 +2706,11 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vmv.v.i v12, -1 -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredand.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredand.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.and.nxv4i64( %v) ret i64 %red @@ -2209,24 +2721,32 @@ define i64 @vreduce_or_nxv4i64( %v) { ; RV32-LABEL: vreduce_or_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetvli a0, 
zero, e64, m4, ta, mu -; RV32-NEXT: vredor.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredor.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_or_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredor.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredor.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.or.nxv4i64( %v) ret i64 %red @@ -2237,24 +2757,32 @@ define i64 @vreduce_xor_nxv4i64( %v) { ; RV32-LABEL: vreduce_xor_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vmv.s.x v12, zero -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vredxor.vs v8, v8, v12 -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV32-NEXT: vredxor.vs v13, v8, v12 +; RV32-NEXT: vmv.x.s a0, v13 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v13, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vreduce_xor_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vredxor.vs v8, v8, v12 -; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.i v13, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, mu +; RV64-NEXT: vredxor.vs v13, v8, v12 +; RV64-NEXT: vmv.x.s a0, v13 ; RV64-NEXT: ret %red = call i64 @llvm.vector.reduce.xor.nxv4i64( %v) ret i64 %red diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll @@ -439,6 +439,8 @@ ; CHECK-NEXT: vmand.mm v1, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a2, a0, 3 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; CHECK-NEXT: vslidedown.vx v0, v1, a2 ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll @@ -439,6 +439,8 @@ ; CHECK-NEXT: vmand.mm v1, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a2, a0, 3 +; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 0 ; CHECK-NEXT: vsetvli a3, zero, e8, 
mf4, ta, mu
 ; CHECK-NEXT: vslidedown.vx v0, v1, a2
 ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -350,49 +350,41 @@
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: addi a1, sp, 16
 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT: vmv1r.v v1, v0
 ; CHECK-NEXT: li a3, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a4, a1, 3
 ; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: vl8re32.v v8, (a4)
+; CHECK-NEXT: vl8re32.v v24, (a4)
 ; CHECK-NEXT: srli a5, a1, 2
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a4, a2, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a5
+; CHECK-NEXT: vslidedown.vx v0, v1, a5
 ; CHECK-NEXT: bltu a2, a4, .LBB27_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a3, a4
 ; CHECK-NEXT: .LBB27_2:
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re32.v v8, (a0)
 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT: bltu a2, a1, .LBB27_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a2, a1
 ; CHECK-NEXT: .LBB27_4:
 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu
 ; CHECK-NEXT: vmv1r.v v0, v1
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
 ; CHECK-NEXT: addi sp, sp, 16
 ; CHECK-NEXT: ret
@@ -408,49 +400,41 @@
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: addi a1, sp, 16
 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT: vmv1r.v v1, v0
 ; CHECK-NEXT: li a3, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a2, a1, 3
 ; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: vl8re32.v v8, (a2)
+; CHECK-NEXT: vl8re32.v v24, (a2)
 ; CHECK-NEXT: srli a5, a1, 2
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a2, a1, 1
 ; CHECK-NEXT: sub a4, a1, a2
-; CHECK-NEXT: vslidedown.vx v0, v0, a5
+; CHECK-NEXT: vslidedown.vx v0, v1, a5
 ; CHECK-NEXT: bltu a1, a4, .LBB28_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a3, a4
 ; CHECK-NEXT: .LBB28_2:
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re32.v v8, (a0)
 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, mu
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT: bltu a1, a2, .LBB28_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a1, a2
 ; CHECK-NEXT: .LBB28_4:
 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT: vmv1r.v v0, v1
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
 ; CHECK-NEXT: addi sp, sp, 16
 ; CHECK-NEXT: ret
@@ -696,48 +680,40 @@
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: addi a1, sp, 16
 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT: vmv1r.v v1, v0
 ; CHECK-NEXT: li a3, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a4, a1, 3
 ; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: vl8re64.v v8, (a4)
+; CHECK-NEXT: vl8re64.v v24, (a4)
 ; CHECK-NEXT: srli a5, a1, 3
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
 ; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, mu
 ; CHECK-NEXT: sub a4, a2, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a5
+; CHECK-NEXT: vslidedown.vx v0, v1, a5
 ; CHECK-NEXT: bltu a2, a4, .LBB48_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a3, a4
 ; CHECK-NEXT: .LBB48_2:
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v8, (a0)
 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT: bltu a2, a1, .LBB48_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a2, a1
 ; CHECK-NEXT: .LBB48_4:
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
 ; CHECK-NEXT: vmv1r.v v0, v1
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
 ; CHECK-NEXT: addi sp, sp, 16
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -155,10 +155,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v12, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB12_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -310,10 +310,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB25_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
@@ -198,9 +198,11 @@
 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v9, 0
 ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vmv.x.s a0, v9
 ; CHECK-NEXT: andi a0, a0, 1
 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: vmv.v.x v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
@@ -8,9 +8,12 @@
 ; CHECK-LABEL: vtrunc_nxv2i1_nxv2i16:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vand.vi v8, v8, 1, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t
+; CHECK-NEXT: vand.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.trunc.nxv2i16.nxv2i1( %a, %m, i32 %vl)
 ret %v
@@ -33,9 +36,12 @@
 ; CHECK-LABEL: vtrunc_nxv2i1_nxv2i32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vand.vi v8, v8, 1, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v8, 0, v0.t
+; CHECK-NEXT: vand.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsne.vi v8, v9, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.trunc.nxv2i1.nxv2i32( %a, %m, i32 %vl)
 ret %v
@@ -59,7 +65,9 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT: vand.vi v10, v8, 1, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t
 ; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -161,9 +161,11 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 3
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB12_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
@@ -171,7 +173,7 @@
 ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, mu
 ; CHECK-NEXT: vncvt.x.x.w v28, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v18, v28, v0.t
+; CHECK-NEXT: vncvt.x.x.w v16, v28, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB12_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a0, a1
@@ -180,8 +182,10 @@
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vncvt.x.x.w v20, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v16, v20, v0.t
-; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vncvt.x.x.w v8, v20, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv2r.v v10, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.trunc.nxv15i16.nxv15i64( %a, %m, i32 %vl)
 ret %v
@@ -220,10 +224,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB15_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
@@ -231,7 +237,7 @@
 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu
 ; CHECK-NEXT: vncvt.x.x.w v28, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v18, v28, v0.t
+; CHECK-NEXT: vncvt.x.x.w v16, v28, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB15_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a0, a1
@@ -240,8 +246,10 @@
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vncvt.x.x.w v20, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v16, v20, v0.t
-; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vncvt.x.x.w v8, v20, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv2r.v v10, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.trunc.nxv32i7.nxv32i32( %a, %m, i32 %vl)
 ret %v
@@ -256,10 +264,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB16_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
@@ -267,7 +277,7 @@
 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, mu
 ; CHECK-NEXT: vncvt.x.x.w v28, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v18, v28, v0.t
+; CHECK-NEXT: vncvt.x.x.w v16, v28, v0.t
 ; CHECK-NEXT: bltu a0, a1, .LBB16_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a0, a1
@@ -276,8 +286,10 @@
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vncvt.x.x.w v20, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v16, v20, v0.t
-; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vncvt.x.x.w v8, v20, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv2r.v v10, v16
 ; CHECK-NEXT: ret
 %v = call @llvm.vp.trunc.nxv32i8.nxv32i32( %a, %m, i32 %vl)
 ret %v
@@ -291,17 +303,16 @@
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: vmv1r.v v1, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a4, a1, 1
 ; CHECK-NEXT: srli a3, a1, 3
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, mu
+; CHECK-NEXT: slli a4, a1, 1
+; CHECK-NEXT: vmv.v.i v0, 0
 ; CHECK-NEXT: mv a5, a2
 ; CHECK-NEXT: bltu a2, a4, .LBB17_2
 ; CHECK-NEXT: # %bb.1:
@@ -310,71 +321,69 @@
 ; CHECK-NEXT: li a6, 0
 ; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, mu
 ; CHECK-NEXT: sub a7, a5, a1
-; CHECK-NEXT: vslidedown.vx v0, v24, a3
+; CHECK-NEXT: vslidedown.vx v0, v1, a3
 ; CHECK-NEXT: bltu a5, a7, .LBB17_4
 ; CHECK-NEXT: # %bb.3:
 ; CHECK-NEXT: mv a6, a7
 ; CHECK-NEXT: .LBB17_4:
 ; CHECK-NEXT: srli a7, a1, 2
+; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v16, 0
 ; CHECK-NEXT: slli t0, a1, 3
 ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, mu
-; CHECK-NEXT: vncvt.x.x.w v12, v16, v0.t
+; CHECK-NEXT: addi a6, sp, 16
+; CHECK-NEXT: vl8re8.v v24, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vncvt.x.x.w v20, v24, v0.t
 ; CHECK-NEXT: bltu a5, a1, .LBB17_6
 ; CHECK-NEXT: # %bb.5:
 ; CHECK-NEXT: mv a5, a1
 ; CHECK-NEXT: .LBB17_6:
 ; CHECK-NEXT: li a6, 0
 ; CHECK-NEXT: vsetvli t1, zero, e8, mf2, ta, mu
-; CHECK-NEXT: vslidedown.vx v1, v24, a7
+; CHECK-NEXT: vslidedown.vx v16, v1, a7
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v17, 0
 ; CHECK-NEXT: add a7, a0, t0
 ; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vncvt.x.x.w v4, v8, v0.t
+; CHECK-NEXT: vsetvli a5, zero, e16, m8, ta, mu
 ; CHECK-NEXT: sub a4, a2, a4
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8re8.v v16, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vncvt.x.x.w v8, v16, v0.t
+; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: bltu a2, a4, .LBB17_8
 ; CHECK-NEXT: # %bb.7:
 ; CHECK-NEXT: mv a6, a4
 ; CHECK-NEXT: .LBB17_8:
 ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vl8re64.v v16, (a7)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v24, (a7)
 ; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: vmv4r.v v8, v4
 ; CHECK-NEXT: sub a4, a6, a1
-; CHECK-NEXT: vslidedown.vx v0, v1, a3
+; CHECK-NEXT: vslidedown.vx v17, v16, a3
 ; CHECK-NEXT: bltu a6, a4, .LBB17_10
 ; CHECK-NEXT: # %bb.9:
 ; CHECK-NEXT: mv a2, a4
 ; CHECK-NEXT: .LBB17_10:
-; CHECK-NEXT: vl8re64.v v16, (a0)
+; CHECK-NEXT: vl8re64.v v0, (a0)
 ; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v12, v20
 ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, mu
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v17
 ; CHECK-NEXT: vncvt.x.x.w v20, v24, v0.t
 ; CHECK-NEXT: bltu a6, a1, .LBB17_12
 ; CHECK-NEXT: # %bb.11:
 ; CHECK-NEXT: mv a6, a1
 ; CHECK-NEXT: .LBB17_12:
 ; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: vmv1r.v v0, v16
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT: vncvt.x.x.w v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
+; CHECK-NEXT: vmv.v.i v24, 0
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
 ; CHECK-NEXT: addi sp, sp, 16
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -310,10 +310,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB25_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -155,10 +155,12 @@
 ; CHECK-NEXT: li a2, 0
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: vslidedown.vx v0, v12, a4
 ; CHECK-NEXT: bltu a0, a3, .LBB12_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -648,7 +648,9 @@
 ; RV32MV-NEXT: sw a1, 12(sp)
 ; RV32MV-NEXT: sw a0, 8(sp)
 ; RV32MV-NEXT: li a0, 85
-; RV32MV-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
+; RV32MV-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV32MV-NEXT: vmv.v.i v0, 0
+; RV32MV-NEXT: vsetivli zero, 1, e8, mf8, tu, mu
 ; RV32MV-NEXT: vmv.s.x v0, a0
 ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, mu
 ; RV32MV-NEXT: vmv.v.i v8, 1
@@ -665,16 +667,28 @@
 ; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0
 ; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
 ; RV32MV-NEXT: vse32.v v8, (s2)
+; RV32MV-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
 ; RV32MV-NEXT: vslidedown.vi v10, v8, 1
 ; RV32MV-NEXT: vmv.x.s a0, v10
+; RV32MV-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
 ; RV32MV-NEXT: vslidedown.vi v10, v8, 2
 ; RV32MV-NEXT: vmv.x.s a1, v10
 ; RV32MV-NEXT: slli a2, a1, 1
 ; RV32MV-NEXT: sub a0, a2, a0
 ; RV32MV-NEXT: sw a0, 4(s2)
+; RV32MV-NEXT: vsetvli a0, zero, e16, m2, ta, mu
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
 ; RV32MV-NEXT: vslidedown.vi v10, v8, 4
 ; RV32MV-NEXT: vmv.x.s a0, v10
 ; RV32MV-NEXT: srli a2, a0, 30
+; RV32MV-NEXT: vsetvli a3, zero, e16, m2, ta, mu
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
 ; RV32MV-NEXT: vslidedown.vi v10, v8, 5
 ; RV32MV-NEXT: vmv.x.s a3, v10
 ; RV32MV-NEXT: slli a3, a3, 2
@@ -682,8 +696,11 @@
 ; RV32MV-NEXT: andi a2, a2, 7
 ; RV32MV-NEXT: sb a2, 12(s2)
 ; RV32MV-NEXT: srli a1, a1, 31
-; RV32MV-NEXT: vslidedown.vi v8, v8, 3
-; RV32MV-NEXT: vmv.x.s a2, v8
+; RV32MV-NEXT: vsetvli a2, zero, e16, m2, ta, mu
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, mu
+; RV32MV-NEXT: vslidedown.vi v10, v8, 3
+; RV32MV-NEXT: vmv.x.s a2, v10
 ; RV32MV-NEXT: andi a2, a2, 1
 ; RV32MV-NEXT: slli a2, a2, 1
 ; RV32MV-NEXT: or a1, a1, a2
@@ -763,6 +780,8 @@
 ; RV64MV-NEXT: vmsne.vv v0, v8, v10
 ; RV64MV-NEXT: vmv.v.i v8, 0
 ; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64MV-NEXT: vsetvli a2, zero, e16, m2, ta, mu
+; RV64MV-NEXT: vmv.v.i v10, 0
 ; RV64MV-NEXT: vsetivli zero, 1, e64, m2, ta, mu
 ; RV64MV-NEXT: vslidedown.vi v10, v8, 2
 ; RV64MV-NEXT: vmv.x.s a2, v10
@@ -771,8 +790,11 @@
 ; RV64MV-NEXT: sb a3, 12(a0)
 ; RV64MV-NEXT: vmv.x.s a3, v8
 ; RV64MV-NEXT: and a1, a3, a1
-; RV64MV-NEXT: vslidedown.vi v8, v8, 1
-; RV64MV-NEXT: vmv.x.s a3, v8
+; RV64MV-NEXT: vsetvli a3, zero, e16, m2, ta, mu
+; RV64MV-NEXT: vmv.v.i v10, 0
+; RV64MV-NEXT: vsetivli zero, 1, e64, m2, ta, mu
+; RV64MV-NEXT: vslidedown.vi v10, v8, 1
+; RV64MV-NEXT: vmv.x.s a3, v10
 ; RV64MV-NEXT: slli a4, a3, 33
 ; RV64MV-NEXT: or a1, a1, a4
 ; RV64MV-NEXT: sd a1, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -576,6 +576,8 @@
 ; RV32MV-NEXT: vand.vx v8, v8, a1
 ; RV32MV-NEXT: vmsltu.vv v0, v12, v8
 ; RV32MV-NEXT: vmerge.vim v8, v10, -1, v0
+; RV32MV-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; RV32MV-NEXT: vmv.v.i v9, 0
 ; RV32MV-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
 ; RV32MV-NEXT: vslidedown.vi v9, v8, 2
 ; RV32MV-NEXT: vmv.x.s a1, v9
@@ -584,8 +586,11 @@
 ; RV32MV-NEXT: sb a2, 4(a0)
 ; RV32MV-NEXT: vmv.x.s a2, v8
 ; RV32MV-NEXT: andi a2, a2, 2047
-; RV32MV-NEXT: vslidedown.vi v8, v8, 1
-; RV32MV-NEXT: vmv.x.s a3, v8
+; RV32MV-NEXT: vsetvli a3, zero, e16, m1, ta, mu
+; RV32MV-NEXT: vmv.v.i v9, 0
+; RV32MV-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV32MV-NEXT: vslidedown.vi v9, v8, 1
+; RV32MV-NEXT: vmv.x.s a3, v9
 ; RV32MV-NEXT: andi a3, a3, 2047
 ; RV32MV-NEXT: slli a3, a3, 11
 ; RV32MV-NEXT: or a2, a2, a3
@@ -643,14 +648,19 @@
 ; RV64MV-NEXT: vmerge.vim v8, v10, -1, v0
 ; RV64MV-NEXT: vmv.x.s a1, v8
 ; RV64MV-NEXT: andi a1, a1, 2047
+; RV64MV-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64MV-NEXT: vmv.v.i v9, 0
 ; RV64MV-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
 ; RV64MV-NEXT: vslidedown.vi v9, v8, 1
 ; RV64MV-NEXT: vmv.x.s a2, v9
 ; RV64MV-NEXT: andi a2, a2, 2047
 ; RV64MV-NEXT: slli a2, a2, 11
 ; RV64MV-NEXT: or a1, a1, a2
-; RV64MV-NEXT: vslidedown.vi v8, v8, 2
-; RV64MV-NEXT: vmv.x.s a2, v8
+; RV64MV-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64MV-NEXT: vmv.v.i v9, 0
+; RV64MV-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV64MV-NEXT: vslidedown.vi v9, v8, 2
+; RV64MV-NEXT: vmv.x.s a2, v9
 ; RV64MV-NEXT: slli a2, a2, 22
 ; RV64MV-NEXT: or a1, a1, a2
 ; RV64MV-NEXT: sw a1, 0(a0)