Index: llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6928,6 +6928,24 @@
         return Vec;
       return convertFromScalableVector(VecVT, Vec, DAG, Subtarget);
     }
+    // If our index is small enough that the mask for the vmerge would require
+    // only the vmv.v.i form, then we can perform the insert in two instructions:
+    //   vmv.v.i v0, <mask imm>
+    //   vmerge vd, <vec>, <val>, v0
+    // This avoids the need for the vector temporary, and may let us fold the
+    // inserted value if it's a constant. TODO: This could be extended to larger
+    // indices, non-constant indices, or scalable vectors by using vmseq(vid, idx).
+    if (isa<ConstantSDNode>(Idx) && cast<ConstantSDNode>(Idx)->getZExtValue() <= 4 &&
+        VecVT.isFixedLengthVector()) {
+      SmallVector<SDValue> Ops;
+      for (uint64_t i = 0; i < VecVT.getVectorNumElements(); i++)
+        Ops.push_back(DAG.getConstant(i == cast<ConstantSDNode>(Idx)->getZExtValue(), DL, XLenVT));
+      MVT SelMaskTy = VecVT.changeVectorElementType(MVT::i1);
+      return DAG.getNode(ISD::VSELECT, DL, VecVT,
+                         DAG.getBuildVector(SelMaskTy, DL, Ops),
+                         DAG.getSplatBuildVector(VecVT, DL, Val),
+                         convertFromScalableVector(VecVT, Vec, DAG, Subtarget));
+    }
     ValInVec = lowerScalarInsert(Val, VL, ContainerVT, DL, DAG, Subtarget);
   } else {
     // On RV32, i64-element vectors must be specially handled to place the
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -97,11 +97,9 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: lui a1, 262144
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 4
 ; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 2
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT: vse32.v v8, (a0)
 ; CHECK-NEXT: ret
   store <4 x float> , <4 x float>* %x
@@ -112,11 +110,9 @@
 ; CHECK-LABEL: buildvec_dominant1_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 2
 ; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 1
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT: vse32.v v8, (a0)
 ; CHECK-NEXT: ret
   %v0 = insertelement <4 x float> poison, float %f, i32 0
@@ -130,14 +126,12 @@
 define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) {
 ; CHECK-LABEL: buildvec_dominant2_v4f32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, 262144
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v8, a1
-; CHECK-NEXT: vfmv.v.f v9, fa0
-; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v8, 1
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vmv.v.i v0, 2
+; CHECK-NEXT: lui a1, 262144
+; CHECK-NEXT: vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT: vse32.v v8, (a0)
 ; CHECK-NEXT: ret
   %v0 = insertelement <4 x float> poison, float %f, i32 0
   %v1 = insertelement <4 x float> %v0, float 2.0, i32 1
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ 
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck --check-prefixes=CHECK,RV64 %s define <1 x i1> @insertelt_v1i1(<1 x i1> %x, i1 %elt) nounwind { ; CHECK-LABEL: insertelt_v1i1: @@ -38,10 +38,11 @@ ; CHECK-LABEL: insertelt_v2i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vmv.v.i v8, 2 +; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vxm v8, v9, a0, v0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret @@ -71,12 +72,11 @@ ; CHECK-LABEL: insertelt_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vmv.v.i v8, 2 +; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vxm v8, v9, a0, v0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret @@ -103,20 +103,35 @@ } define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind { -; CHECK-LABEL: insertelt_v64i1: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 -; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma -; CHECK-NEXT: vslideup.vi v12, v8, 1 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vand.vi v8, v12, 1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: ret +; RV32-LABEL: insertelt_v64i1: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: vadd.vv v12, v12, v12 +; RV32-NEXT: vrsub.vi v0, v12, 2 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: vand.vi v8, v8, 1 +; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_v64i1: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: vand.vi v8, v8, 1 +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: ret %y = insertelement <64 x i1> %x, i1 %elt, i64 1 ret <64 x i1> %y } Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -17,9 +17,10 @@ ; ; 
RV64-LABEL: insertelt_v4i64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 8 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vslideup.vi v8, v10, 3 +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64-NEXT: ret %b = insertelement <4 x i64> %a, i64 %y, i32 3 ret <4 x i64> %b Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -187,14 +187,12 @@ ; RV32-LABEL: buildvec_vid_step1_add0_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v9, 2 ; RV32-NEXT: lui a0, %hi(.LCPI12_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0) -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vmv.v.i v0, 4 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: buildvec_vid_step1_add0_v4i64: @@ -210,14 +208,12 @@ ; RV32-LABEL: buildvec_vid_step2_add0_v4i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 2 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v9, 2 ; RV32-NEXT: lui a0, %hi(.LCPI13_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vmv.v.i v0, 4 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v8, v8, 2, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: buildvec_vid_step2_add0_v4i64: @@ -267,12 +263,10 @@ ; CHECK-LABEL: buildvec_dominant0_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vmv.v.i v9, 8 -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v8, 3 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vmv.v.i v8, 8 +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret store <8 x i16> , ptr %x ret void @@ -480,49 +474,25 @@ } define void @buildvec_vid_step1o2_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ptr %z4, ptr %z5, ptr %z6) { -; RV32-LABEL: buildvec_vid_step1o2_v4i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vsrl.vi v8, v8, 1 -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: vse32.v v8, (a1) -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vse32.v v8, (a2) -; RV32-NEXT: vse32.v v8, (a3) -; RV32-NEXT: vse32.v v8, (a4) -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v9, v8, 1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vse32.v v9, (a5) -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vse32.v v8, (a6) -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_vid_step1o2_v4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vsrl.vi v8, v8, 1 -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: vmv.v.i v9, 1 -; RV64-NEXT: vse32.v v8, (a1) -; RV64-NEXT: vse32.v v8, (a2) -; RV64-NEXT: vse32.v v8, (a3) -; RV64-NEXT: vse32.v v8, (a4) -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vsetivli zero, 2, e32, m1, 
tu, ma -; RV64-NEXT: vslideup.vi v9, v8, 1 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vse32.v v9, (a5) -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vse32.v v8, (a6) -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_vid_step1o2_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v8, (a1) +; CHECK-NEXT: vse32.v v8, (a2) +; CHECK-NEXT: vse32.v v8, (a3) +; CHECK-NEXT: vse32.v v8, (a4) +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vse32.v v8, (a5) +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vse32.v v8, (a6) +; CHECK-NEXT: ret store <4 x i32> , ptr %z0 store <4 x i32> , ptr %z1 store <4 x i32> , ptr %z2 @@ -543,18 +513,17 @@ ; CHECK-NEXT: vsrl.vi v8, v8, 1 ; CHECK-NEXT: vadd.vi v8, v8, 3 ; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 3 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: vse16.v v8, (a4) -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: vmv.v.i v9, 4 -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v9, v8, 1 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vse16.v v9, (a5) +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v8, 4 +; CHECK-NEXT: vmerge.vim v8, v8, 3, v0 +; CHECK-NEXT: vse16.v v8, (a5) ; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v8, v9, a0 ; CHECK-NEXT: vse16.v v8, (a6) ; CHECK-NEXT: ret store <4 x i16> , ptr %z0 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -372,11 +372,9 @@ ; CHECK-LABEL: splat_ve4_ins_i1ve3: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vmv.v.i v10, 4 -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v9, 4 +; CHECK-NEXT: vmerge.vim v10, v9, 3, v0 ; CHECK-NEXT: vrgather.vv v9, v8, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret @@ -469,15 +467,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i2we4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 2 -; CHECK-NEXT: vmv.v.i v8, 4 -; CHECK-NEXT: vmv.v.i v11, 0 -; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v11, v8, 2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v11, v10, 4, v0 ; CHECK-NEXT: li a0, 70 ; CHECK-NEXT: vmv.v.x v0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1272,19 +1272,17 @@ ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: lui a1, %hi(.LCPI68_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0) -; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vmulhu.vv v9, v8, v9 -; CHECK-NEXT: vsub.vv v8, v8, v9 +; CHECK-NEXT: vle32.v v10, (a1) +; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: vmv.v.i v11, 0 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 2 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmulhu.vv v8, v8, v11 -; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0 +; CHECK-NEXT: vmulhu.vv v10, v8, v10 +; CHECK-NEXT: vsub.vv v8, v8, v10 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: vmv.v.i v9, 2 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vslide1down.vx v9, v9, a1 @@ -1525,16 +1523,15 @@ ; RV32-NEXT: vrsub.vi v10, v10, 0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmadd.vv v10, v8, v9 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsrl.vx v8, v10, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v11, v9, 2 +; RV32-NEXT: vmv.v.i v0, 4 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsra.vv v9, v10, v11 -; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsra.vv v8, v10, v8 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsrl.vx v9, v10, a1 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; @@ -5200,33 +5197,31 @@ ; LMULMAX1-RV32-LABEL: mulhu_v8i32: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0 ; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI183_0) ; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI183_0) -; LMULMAX1-RV32-NEXT: vle32.v v9, (a2) -; LMULMAX1-RV32-NEXT: vle32.v v10, (a0) -; LMULMAX1-RV32-NEXT: vmulhu.vv v11, v8, v9 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v11 +; LMULMAX1-RV32-NEXT: vle32.v v11, (a2) +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 4 ; LMULMAX1-RV32-NEXT: lui a2, 524288 -; LMULMAX1-RV32-NEXT: vmv.s.x v12, a2 -; LMULMAX1-RV32-NEXT: vmv.v.i v13, 0 -; LMULMAX1-RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; LMULMAX1-RV32-NEXT: vslideup.vi v13, v12, 2 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmulhu.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vmv.v.i v11, 2 +; LMULMAX1-RV32-NEXT: vmerge.vxm v10, v10, a2, v0 +; LMULMAX1-RV32-NEXT: vmulhu.vv v12, v9, v11 +; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v12 +; LMULMAX1-RV32-NEXT: vmulhu.vv v9, v9, v10 +; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v12 +; LMULMAX1-RV32-NEXT: vmv.v.i v12, 2 ; LMULMAX1-RV32-NEXT: li a2, 1 -; LMULMAX1-RV32-NEXT: vslide1down.vx v11, v11, a2 -; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vmulhu.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsub.vv v10, v10, v9 -; LMULMAX1-RV32-NEXT: vmulhu.vv v10, v10, v13 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vv v9, v9, v11 -; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vslide1down.vx v12, v12, a2 +; LMULMAX1-RV32-NEXT: vsrl.vv v9, 
v9, v12 +; LMULMAX1-RV32-NEXT: vmulhu.vv v11, v8, v11 +; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v11 +; LMULMAX1-RV32-NEXT: vmulhu.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v11 +; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v9, (a1) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhu_v8i32: @@ -5283,24 +5278,24 @@ ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 4 +; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI184_0) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI184_0) -; LMULMAX2-RV64-NEXT: vle64.v v10, (a1) -; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v8, v10 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: vle64.v v12, (a1) ; LMULMAX2-RV64-NEXT: li a1, -1 ; LMULMAX2-RV64-NEXT: slli a1, a1, 63 -; LMULMAX2-RV64-NEXT: vmv.s.x v12, a1 -; LMULMAX2-RV64-NEXT: vmv.v.i v14, 0 -; LMULMAX2-RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; LMULMAX2-RV64-NEXT: vslideup.vi v14, v12, 2 -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV64-NEXT: vmulhu.vv v12, v8, v12 ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI184_1) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI184_1) -; LMULMAX2-RV64-NEXT: vle64.v v12, (a1) -; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v14 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vle64.v v14, (a1) +; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v14 ; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64-NEXT: ret ; Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -89,9 +89,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB1_2 ; RV64ZVE32F-NEXT: .LBB1_4: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) ret <2 x i8> %v @@ -137,9 +138,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB2_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB2_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 @@ -190,9 +192,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB3_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x 
v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB3_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 @@ -243,9 +246,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB4_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB4_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vsext.vf4 v9, v8 @@ -296,9 +300,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB5_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB5_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vzext.vf4 v9, v8 @@ -356,9 +361,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB6_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB6_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 @@ -417,9 +423,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB7_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB7_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 @@ -477,25 +484,28 @@ ; RV64ZVE32F-NEXT: .LBB8_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB8_3 ; RV64ZVE32F-NEXT: .LBB8_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB8_4 ; RV64ZVE32F-NEXT: .LBB8_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: 
vsetivli zero, 4, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru) ret <4 x i8> %v @@ -543,25 +553,28 @@ ; RV64ZVE32F-NEXT: .LBB9_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB9_3 ; RV64ZVE32F-NEXT: .LBB9_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB9_4 ; RV64ZVE32F-NEXT: .LBB9_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -643,33 +656,38 @@ ; RV64ZVE32F-NEXT: .LBB11_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_3 ; RV64ZVE32F-NEXT: .LBB11_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_4 ; RV64ZVE32F-NEXT: .LBB11_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_5 ; RV64ZVE32F-NEXT: .LBB11_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: 
vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_6 ; RV64ZVE32F-NEXT: .LBB11_14: # %cond.load13 @@ -739,9 +757,9 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB12_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -751,9 +769,9 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB12_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -791,19 +809,20 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 ; RV64ZVE32F-NEXT: .LBB12_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_9 ; RV64ZVE32F-NEXT: j .LBB12_10 @@ -913,9 +932,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB14_2 ; RV64ZVE32F-NEXT: .LBB14_4: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) ret <2 x i16> %v @@ -961,9 +981,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB15_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB15_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 
2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 @@ -1014,9 +1035,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB16_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB16_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 @@ -1074,9 +1096,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB17_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB17_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 @@ -1137,9 +1160,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB18_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -1199,25 +1223,28 @@ ; RV64ZVE32F-NEXT: .LBB19_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_3 ; RV64ZVE32F-NEXT: .LBB19_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB19_4 ; RV64ZVE32F-NEXT: .LBB19_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru) ret <4 x i16> %v @@ -1265,25 +1292,28 @@ ; RV64ZVE32F-NEXT: .LBB20_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB20_3 ; RV64ZVE32F-NEXT: .LBB20_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB20_4 ; RV64ZVE32F-NEXT: .LBB20_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -1365,33 +1395,38 @@ ; RV64ZVE32F-NEXT: .LBB22_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_3 ; RV64ZVE32F-NEXT: .LBB22_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_4 ; RV64ZVE32F-NEXT: .LBB22_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_5 ; RV64ZVE32F-NEXT: .LBB22_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_6 ; RV64ZVE32F-NEXT: .LBB22_14: # %cond.load13 @@ -1465,9 +1500,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 
+; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB23_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -1478,9 +1513,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB23_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1520,20 +1555,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 ; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_9 ; RV64ZVE32F-NEXT: j .LBB23_10 @@ -1608,9 +1644,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB24_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -1621,9 +1657,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB24_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1663,20 +1699,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB24_8 ; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, 
e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_9 ; RV64ZVE32F-NEXT: j .LBB24_10 @@ -1754,9 +1791,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB25_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -1768,9 +1805,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB25_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1812,21 +1849,22 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 ; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_9 ; RV64ZVE32F-NEXT: j .LBB25_10 @@ -1903,9 +1941,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB26_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -1916,9 +1954,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; 
RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB26_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1958,9 +1996,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 ; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load10 @@ -1969,9 +2007,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_9 ; RV64ZVE32F-NEXT: j .LBB26_10 @@ -2083,9 +2123,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB28_2 ; RV64ZVE32F-NEXT: .LBB28_4: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) ret <2 x i32> %v @@ -2140,9 +2181,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB29_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 @@ -2199,9 +2241,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB30_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 @@ -2261,25 +2304,28 @@ ; RV64ZVE32F-NEXT: .LBB31_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_3 ; RV64ZVE32F-NEXT: .LBB31_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; 
RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB31_4 ; RV64ZVE32F-NEXT: .LBB31_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru) ret <4 x i32> %v @@ -2326,25 +2372,28 @@ ; RV64ZVE32F-NEXT: .LBB32_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB32_3 ; RV64ZVE32F-NEXT: .LBB32_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB32_4 ; RV64ZVE32F-NEXT: .LBB32_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v9, a0 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -2426,33 +2475,38 @@ ; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_3 ; RV64ZVE32F-NEXT: .LBB34_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_4 ; RV64ZVE32F-NEXT: .LBB34_12: # 
%cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_5 ; RV64ZVE32F-NEXT: .LBB34_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_6 ; RV64ZVE32F-NEXT: .LBB34_14: # %cond.load13 @@ -2525,10 +2579,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -2539,10 +2592,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB35_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2583,21 +2635,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 ; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_9 ; RV64ZVE32F-NEXT: j .LBB35_10 @@ -2673,10 +2725,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB36_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -2687,10 +2738,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB36_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2731,21 +2781,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 ; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_9 ; RV64ZVE32F-NEXT: j .LBB36_10 @@ -2824,10 +2874,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB37_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -2839,10 +2888,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB37_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2885,22 +2933,22 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 ; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_9 ; RV64ZVE32F-NEXT: j .LBB37_10 @@ -2980,10 +3028,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB38_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -2994,10 +3041,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB38_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -3038,10 +3084,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 ; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10 @@ -3050,9 +3095,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_9 ; RV64ZVE32F-NEXT: j .LBB38_10 @@ -3129,10 +3176,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB39_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -3143,10 +3189,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB39_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -3187,10 +3232,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 ; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10 @@ -3199,9 +3243,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_9 ; RV64ZVE32F-NEXT: j .LBB39_10 @@ -3283,10 +3329,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a3, v0 ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 @@ -3298,10 +3343,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a3, v0 ; RV64ZVE32F-NEXT: .LBB40_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -3344,10 +3388,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: 
vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a3, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: beqz a3, .LBB40_8 ; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10 @@ -3357,9 +3400,11 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a3, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: bnez a3, .LBB40_9 ; RV64ZVE32F-NEXT: j .LBB40_10 @@ -3438,9 +3483,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB41_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 @@ -3483,9 +3528,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_6 ; RV64ZVE32F-NEXT: .LBB41_13: # %cond.load7 @@ -3495,9 +3540,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_7 ; RV64ZVE32F-NEXT: .LBB41_14: # %cond.load10 @@ -3506,9 +3551,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB41_8 ; RV64ZVE32F-NEXT: j .LBB41_9 @@ -7084,9 +7131,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 ; RV64ZVE32F-NEXT: .LBB59_4: # %cond.load1 ; RV64ZVE32F-NEXT: flh fa5, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %v = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru) ret <2 x half> %v @@ -7136,25 
+7184,28 @@ ; RV64ZVE32F-NEXT: .LBB60_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_3 ; RV64ZVE32F-NEXT: .LBB60_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB60_4 ; RV64ZVE32F-NEXT: .LBB60_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru) ret <4 x half> %v @@ -7202,25 +7253,28 @@ ; RV64ZVE32F-NEXT: .LBB61_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB61_3 ; RV64ZVE32F-NEXT: .LBB61_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB61_4 ; RV64ZVE32F-NEXT: .LBB61_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -7302,33 +7356,38 @@ ; RV64ZVE32F-NEXT: .LBB63_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: 
vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_3 ; RV64ZVE32F-NEXT: .LBB63_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_4 ; RV64ZVE32F-NEXT: .LBB63_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_5 ; RV64ZVE32F-NEXT: .LBB63_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_6 ; RV64ZVE32F-NEXT: .LBB63_14: # %cond.load13 @@ -7402,9 +7461,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB64_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -7415,9 +7474,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB64_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7457,20 +7516,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 ; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; 
RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_9 ; RV64ZVE32F-NEXT: j .LBB64_10 @@ -7545,9 +7605,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB65_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -7558,9 +7618,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB65_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7600,20 +7660,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 ; RV64ZVE32F-NEXT: .LBB65_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_9 ; RV64ZVE32F-NEXT: j .LBB65_10 @@ -7691,9 +7752,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB66_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -7705,9 +7766,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB66_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, 
ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7749,21 +7810,22 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 ; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_9 ; RV64ZVE32F-NEXT: j .LBB66_10 @@ -7840,9 +7902,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB67_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -7853,9 +7915,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: .LBB67_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7895,9 +7957,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 ; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load10 @@ -7906,9 +7968,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v9, v9, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_9 ; RV64ZVE32F-NEXT: j .LBB67_10 @@ -8020,9 +8084,10 @@ ; RV64ZVE32F-NEXT: beqz a2, .LBB69_2 ; RV64ZVE32F-NEXT: .LBB69_4: # %cond.load1 ; RV64ZVE32F-NEXT: flw fa5, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, 
mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru) ret <2 x float> %v @@ -8072,25 +8137,28 @@ ; RV64ZVE32F-NEXT: .LBB70_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_3 ; RV64ZVE32F-NEXT: .LBB70_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB70_4 ; RV64ZVE32F-NEXT: .LBB70_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru) ret <4 x float> %v @@ -8137,25 +8205,28 @@ ; RV64ZVE32F-NEXT: .LBB71_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB71_3 ; RV64ZVE32F-NEXT: .LBB71_7: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: beqz a1, .LBB71_4 ; RV64ZVE32F-NEXT: .LBB71_8: # %cond.load7 ; RV64ZVE32F-NEXT: ld a0, 24(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -8237,33 +8308,38 @@ ; RV64ZVE32F-NEXT: 
.LBB73_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_3 ; RV64ZVE32F-NEXT: .LBB73_11: # %cond.load4 ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_4 ; RV64ZVE32F-NEXT: .LBB73_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld a2, 24(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_5 ; RV64ZVE32F-NEXT: .LBB73_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld a2, 32(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_6 ; RV64ZVE32F-NEXT: .LBB73_14: # %cond.load13 @@ -8336,10 +8412,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB74_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -8350,10 +8425,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB74_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8394,21 +8468,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi 
v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 ; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_9 ; RV64ZVE32F-NEXT: j .LBB74_10 @@ -8484,10 +8558,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB75_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -8498,10 +8571,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB75_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8542,21 +8614,21 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB75_8 ; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB75_9 ; RV64ZVE32F-NEXT: j .LBB75_10 @@ -8635,10 +8707,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB76_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -8650,10 +8721,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB76_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8696,22 +8766,22 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB76_8 ; RV64ZVE32F-NEXT: .LBB76_14: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_9 ; RV64ZVE32F-NEXT: j .LBB76_10 @@ -8791,10 +8861,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB77_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -8805,10 +8874,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB77_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8849,10 +8917,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB77_8 ; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10 @@ -8861,9 +8928,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB77_9 ; RV64ZVE32F-NEXT: j .LBB77_10 @@ -8940,10 +9009,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB78_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -8954,10 +9022,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB78_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8998,10 +9065,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 ; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10 @@ -9010,9 +9076,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_9 ; RV64ZVE32F-NEXT: j .LBB78_10 @@ -9094,10 +9162,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: 
vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB79_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 @@ -9109,10 +9176,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB79_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -9155,10 +9221,9 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: beqz a3, .LBB79_8 ; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10 @@ -9168,9 +9233,11 @@ ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: bnez a3, .LBB79_9 ; RV64ZVE32F-NEXT: j .LBB79_10 @@ -9249,9 +9316,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: .LBB80_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 @@ -9294,9 +9361,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v14, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB80_6 ; RV64ZVE32F-NEXT: .LBB80_13: # %cond.load7 @@ -9306,9 +9373,9 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB80_7 ; 
RV64ZVE32F-NEXT: .LBB80_14: # %cond.load10 @@ -9317,9 +9384,11 @@ ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: li a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmerge.vfm v10, v10, fa5, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB80_8 ; RV64ZVE32F-NEXT: j .LBB80_9 @@ -12140,9 +12209,10 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -12152,9 +12222,10 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: .LBB97_6: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -12266,9 +12337,10 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 ; RV64ZVE32F-NEXT: .LBB97_27: # %cond.load10 @@ -12276,9 +12348,11 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 +; RV64ZVE32F-NEXT: li a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v9, v9, a2, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_9 ; RV64ZVE32F-NEXT: j .LBB97_10 @@ -12400,10 +12474,11 @@ ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.v.i v0, 2 ; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0 ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 @@ -12413,10 +12488,11 
@@
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.v.i v0, 4
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
+; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0
; RV64ZVE32F-NEXT: .LBB98_6: # %else5
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
@@ -12672,10 +12748,11 @@
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.v.i v0, 8
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3
+; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: beqz a2, .LBB98_8
; RV64ZVE32F-NEXT: .LBB98_51: # %cond.load10
@@ -12683,11 +12760,12 @@
; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: li a3, 16
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.v.x v0, a3
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4
+; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vmerge.vxm v10, v10, a2, v0
; RV64ZVE32F-NEXT: andi a2, a1, 32
; RV64ZVE32F-NEXT: bnez a2, .LBB98_9
; RV64ZVE32F-NEXT: j .LBB98_10
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
@@ -258,10 +258,9 @@
; CHECK-LABEL: vslide1down_v2f64_inverted:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 2
; CHECK-NEXT: vrgather.vi v9, v8, 0
-; CHECK-NEXT: vfmv.s.f v8, fa0
-; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: vfmerge.vfm v8, v9, fa0, v0
; CHECK-NEXT: ret
%v1 = shufflevector <2 x double> %v, <2 x double> poison, <2 x i32>
%v2 = insertelement <2 x double> %v1, double %b, i64 1
@@ -272,11 +271,9 @@
; CHECK-LABEL: vslide1down_4xi8_inverted:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 2
; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vmerge.vxm v8, v9, a0, v0
; CHECK-NEXT: ret
%v1 = shufflevector <4 x i8> %v, <4 x i8> poison, <4 x i32>
%v2 = insertelement <4 x i8> %v1, i8 %b, i64 1
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
@@ -302,12 +302,12 @@
define <2 x double> @vslide1up_2xf64_as_rotate(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_2xf64_as_rotate:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 2
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vfmv.s.f v9, fa0
+; CHECK-NEXT: vfmerge.vfm v9, v8, fa0, v0
+; CHECK-NEXT: vslidedown.vi v8, v9, 1
; CHECK-NEXT: vslideup.vi v8, v9, 1
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%v1 = insertelement <2 x double> %v, double %b, i64 1
%v2 = shufflevector <2 x double> %v1, <2 x double> poison, <2 x i32>
@@ -317,12 +317,12 @@
define <4 x i8> @vslide1up_4xi8_as_rotate(<4 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_4xi8_as_rotate:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vslideup.vi v8, v9, 3
-; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: vmerge.vxm v9, v8, a0, v0
+; CHECK-NEXT: vslidedown.vi v8, v9, 3
+; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%v1 = insertelement <4 x i8> %v, i8 %b, i64 3
%v2 = shufflevector <4 x i8> %v1, <4 x i8> poison, <4 x i32>
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
@@ -190,10 +190,9 @@
; RV32LMULMAX1-LABEL: stepvector_v2i64:
; RV32LMULMAX1: # %bb.0:
; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32LMULMAX1-NEXT: vmv.v.i v9, 1
+; RV32LMULMAX1-NEXT: vmv.v.i v0, 4
; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
-; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2
+; RV32LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
; RV32LMULMAX1-NEXT: ret
;
; RV64LMULMAX1-LABEL: stepvector_v2i64:
@@ -205,10 +204,9 @@
; RV32LMULMAX2-LABEL: stepvector_v2i64:
; RV32LMULMAX2: # %bb.0:
; RV32LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32LMULMAX2-NEXT: vmv.v.i v9, 1
+; RV32LMULMAX2-NEXT: vmv.v.i v0, 4
; RV32LMULMAX2-NEXT: vmv.v.i v8, 0
-; RV32LMULMAX2-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32LMULMAX2-NEXT: vslideup.vi v8, v9, 2
+; RV32LMULMAX2-NEXT: vmerge.vim v8, v8, 1, v0
; RV32LMULMAX2-NEXT: ret
;
; RV64LMULMAX2-LABEL: stepvector_v2i64:
@@ -226,14 +224,12 @@
; RV32LMULMAX1-LABEL: stepvector_v4i64:
; RV32LMULMAX1: # %bb.0:
; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32LMULMAX1-NEXT: vmv.v.i v9, 1
-; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
-; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI14_0)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI14_0)
-; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32LMULMAX1-NEXT: vle32.v v9, (a0)
+; RV32LMULMAX1-NEXT: vmv.v.i v0, 4
+; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
+; RV32LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
; RV32LMULMAX1-NEXT: ret
;
; RV64LMULMAX1-LABEL: stepvector_v4i64:
@@ -266,13 +262,8 @@
; RV32LMULMAX1-LABEL: stepvector_v8i64:
; RV32LMULMAX1: # %bb.0:
; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32LMULMAX1-NEXT: vmv.v.i v9, 1
-; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
-; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI15_0)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI15_0)
-; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32LMULMAX1-NEXT: vle32.v v9, (a0)
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI15_1)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI15_1)
@@ -280,6 +271,9 @@
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI15_2)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI15_2)
; RV32LMULMAX1-NEXT: vle32.v v11, (a0)
+; RV32LMULMAX1-NEXT: vmv.v.i v0, 4
+; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
+; RV32LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
; RV32LMULMAX1-NEXT: ret
;
; RV64LMULMAX1-LABEL: stepvector_v8i64:
@@ -318,13 +312,8 @@
; RV32LMULMAX1-LABEL: stepvector_v16i64:
; RV32LMULMAX1: # %bb.0:
; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32LMULMAX1-NEXT: vmv.v.i v9, 1
-; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
-; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI16_0)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI16_0)
-; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32LMULMAX1-NEXT: vle32.v v9, (a0)
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI16_1)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI16_1)
@@ -344,6 +333,9 @@
; RV32LMULMAX1-NEXT: lui a0, %hi(.LCPI16_6)
; RV32LMULMAX1-NEXT: addi a0, a0, %lo(.LCPI16_6)
; RV32LMULMAX1-NEXT: vle32.v v15, (a0)
+; RV32LMULMAX1-NEXT: vmv.v.i v0, 4
+; RV32LMULMAX1-NEXT: vmv.v.i v8, 0
+; RV32LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
; RV32LMULMAX1-NEXT: ret
;
; RV64LMULMAX1-LABEL: stepvector_v16i64:
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -106,9 +106,9 @@
; RV32-SLOW-NEXT: lbu a0, 0(a0)
; RV32-SLOW-NEXT: slli a1, a1, 8
; RV32-SLOW-NEXT: or a0, a1, a0
-; RV32-SLOW-NEXT: vmv.s.x v8, a0
+; RV32-SLOW-NEXT: vmv.v.i v0, 2
; RV32-SLOW-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; RV32-SLOW-NEXT: vslideup.vi v9, v8, 1
+; RV32-SLOW-NEXT: vmerge.vxm v9, v9, a0, v0
; RV32-SLOW-NEXT: vmv1r.v v8, v9
; RV32-SLOW-NEXT: ret
;
@@ -143,9 +143,9 @@
; RV64-SLOW-NEXT: lbu a0, 0(a0)
; RV64-SLOW-NEXT: slli a1, a1, 8
; RV64-SLOW-NEXT: or a0, a1, a0
-; RV64-SLOW-NEXT: vmv.s.x v8, a0
+; RV64-SLOW-NEXT: vmv.v.i v0, 2
; RV64-SLOW-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1
+; RV64-SLOW-NEXT: vmerge.vxm v9, v9, a0, v0
; RV64-SLOW-NEXT: vmv1r.v v8, v9
; RV64-SLOW-NEXT: ret
;
@@ -236,9 +236,9 @@
; RV64-SLOW-NEXT: lwu a0, 0(a0)
; RV64-SLOW-NEXT: slli a1, a1, 32
; RV64-SLOW-NEXT: or a0, a1, a0
-; RV64-SLOW-NEXT: vmv.s.x v8, a0
+; RV64-SLOW-NEXT: vmv.v.i v0, 2
; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1
+; RV64-SLOW-NEXT: vmerge.vxm v9, v9, a0, v0
; RV64-SLOW-NEXT: vmv1r.v v8, v9
; RV64-SLOW-NEXT: ret
;
@@ -524,9 +524,10 @@
; RV32-SLOW-NEXT: slli a0, a0, 24
; RV32-SLOW-NEXT: or a0, a0, a4
; RV32-SLOW-NEXT: or a0, a0, a2
+; RV32-SLOW-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-SLOW-NEXT: vmv.v.i v0, 2
; RV32-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-SLOW-NEXT: vmv.s.x v9, a0
-; RV32-SLOW-NEXT: vslideup.vi v8, v9, 1
+; RV32-SLOW-NEXT: vmerge.vxm v8, v8, a0, v0
; RV32-SLOW-NEXT: .LBB8_4: # %else2
; RV32-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-SLOW-NEXT: vse32.v v8, (a1)
@@ -568,9 +569,10 @@
; RV64-SLOW-NEXT: slli a0, a0, 24
; RV64-SLOW-NEXT: or a0, a0, a4
; RV64-SLOW-NEXT: or a0, a0, a2
+; RV64-SLOW-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-SLOW-NEXT: vmv.v.i v0, 2
; RV64-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-SLOW-NEXT: vmv.s.x v9, a0
-; RV64-SLOW-NEXT: vslideup.vi v8, v9, 1
+; RV64-SLOW-NEXT: vmerge.vxm v8, v8, a0, v0
; RV64-SLOW-NEXT: .LBB8_4: # %else2
; RV64-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-SLOW-NEXT: vse32.v v8, (a1)
Index: llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
===================================================================
--- llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -668,17 +668,18 @@
; RV32MV-NEXT: vmv.v.i v10, 1
; RV32MV-NEXT: vmerge.vim v10, v10, -1, v0
; RV32MV-NEXT: vand.vv v8, v8, v10
-; RV32MV-NEXT: li a0, 2
-; RV32MV-NEXT: vmv.s.x v10, a0
-; RV32MV-NEXT: li a0, 1
-; RV32MV-NEXT: vmv.s.x v12, a0
-; RV32MV-NEXT: vmv.v.i v14, 0
-; RV32MV-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV32MV-NEXT: vslideup.vi v14, v12, 2
-; RV32MV-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV32MV-NEXT: vslideup.vi v14, v10, 4
+; RV32MV-NEXT: vmv.v.i v10, 0
+; RV32MV-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32MV-NEXT: vmv.v.i v0, 4
+; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32MV-NEXT: vmerge.vim v10, v10, 1, v0
+; RV32MV-NEXT: li a0, 16
+; RV32MV-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32MV-NEXT: vmv.v.x v0, a0
+; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32MV-NEXT: vmerge.vim v10, v10, 2, v0
; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32MV-NEXT: vmsne.vv v0, v8, v14
+; RV32MV-NEXT: vmsne.vv v0, v8, v10
; RV32MV-NEXT: vmv.v.i v8, 0
; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0
; RV32MV-NEXT: vsetivli zero, 1, e32, m2, ta, ma