diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -942,6 +942,8 @@ setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER, ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL, ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR}); + if (Subtarget.useRVVForFixedLengthVectors()) + setTargetDAGCombine(ISD::BITCAST); setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); @@ -9049,6 +9051,26 @@ } } } + case ISD::BITCAST: { + assert(Subtarget.useRVVForFixedLengthVectors()); + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + // If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer + // type, widen both sides to avoid a trip through memory. + if ((SrcVT == MVT::v1i1 || SrcVT == MVT::v2i1 || SrcVT == MVT::v4i1) && + VT.isScalarInteger()) { + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); + SmallVector<SDValue> Ops(NumConcats, DAG.getUNDEF(SrcVT)); + Ops[0] = N0; + SDLoc DL(N); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i1, Ops); + N0 = DAG.getBitcast(MVT::i8, N0); + return DAG.getNode(ISD::TRUNCATE, DL, VT, N0); + } + + return SDValue(); + } } return SDValue(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -75,27 +75,14 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB1_4 ; RV64ZVE32F-NEXT: .LBB1_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB1_3: # %cond.load ; RV64ZVE32F-NEXT: lb a0, 0(a0) @@ -109,7 +96,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) ret <2 x i8> %v @@ -142,20 +128,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 
0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB2_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -175,7 +149,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i16> @@ -209,20 +182,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB3_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -242,7 +203,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i16> @@ -276,20 +236,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB4_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -309,7 +257,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vsext.vf4 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i32> @@ -343,20 +290,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; 
RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB5_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -376,7 +311,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vzext.vf4 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i32> @@ -418,20 +352,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB6_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -452,7 +374,6 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i64> @@ -494,20 +415,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB7_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -530,7 +439,6 @@ ; RV64ZVE32F-NEXT: andi a1, a0, 255 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i64> @@ -556,20 +464,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) -; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: bnez a2, .LBB8_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -582,7 +478,6 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB8_8 ; RV64ZVE32F-NEXT: .LBB8_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB8_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) @@ -616,7 +511,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru) ret <4 x i8> %v @@ -639,67 +533,53 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB9_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB9_6 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_6 ; RV64ZVE32F-NEXT: .LBB9_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB9_7 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_7 ; RV64ZVE32F-NEXT: .LBB9_3: # %else5 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB9_8 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB9_8 ; RV64ZVE32F-NEXT: .LBB9_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB9_5: # %cond.load -; RV64ZVE32F-NEXT: lb a4, 0(a4) +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a4 -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB9_2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_2 ; RV64ZVE32F-NEXT: .LBB9_6: # %cond.load1 -; RV64ZVE32F-NEXT: lb a3, 0(a3) +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: 
vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB9_3 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_3 ; RV64ZVE32F-NEXT: .LBB9_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lb a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB9_4 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB9_4 ; RV64ZVE32F-NEXT: .LBB9_8: # %cond.load7 -; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lb a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -1045,27 +925,14 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB14_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB14_4 ; RV64ZVE32F-NEXT: .LBB14_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB14_3: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) @@ -1079,7 +946,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) ret <2 x i16> %v @@ -1112,20 +978,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB15_2 ; RV64ZVE32F-NEXT: # %bb.1: # 
%cond.load @@ -1145,7 +999,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = sext <2 x i16> %v to <2 x i32> @@ -1179,20 +1032,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB16_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -1212,7 +1053,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = zext <2 x i16> %v to <2 x i32> @@ -1254,20 +1094,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB17_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -1288,7 +1116,6 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = sext <2 x i16> %v to <2 x i64> @@ -1332,20 +1159,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) 
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB18_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -1371,7 +1186,6 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: and a1, a2, a1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = zext <2 x i16> %v to <2 x i64> @@ -1397,20 +1211,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) -; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -1423,7 +1225,6 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB19_8 ; RV64ZVE32F-NEXT: .LBB19_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB19_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) @@ -1457,7 +1258,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru) ret <4 x i16> %v @@ -1480,67 +1280,53 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB20_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB20_6 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_6 ; RV64ZVE32F-NEXT: .LBB20_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB20_7 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_7 ; RV64ZVE32F-NEXT: .LBB20_3: # %else5 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB20_8 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB20_8 ; 
RV64ZVE32F-NEXT: .LBB20_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB20_5: # %cond.load -; RV64ZVE32F-NEXT: lh a4, 0(a4) +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a4 -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB20_2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_2 ; RV64ZVE32F-NEXT: .LBB20_6: # %cond.load1 -; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB20_3 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_3 ; RV64ZVE32F-NEXT: .LBB20_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB20_4 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB20_4 ; RV64ZVE32F-NEXT: .LBB20_8: # %cond.load7 -; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lh a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -2357,27 +2143,14 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB28_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB28_4 ; RV64ZVE32F-NEXT: .LBB28_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB28_3: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) @@ -2391,7 +2164,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) ret <2 x i32> %v @@ -2433,20 +2205,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: 
addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB29_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -2467,7 +2227,6 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) %ev = sext <2 x i32> %v to <2 x i64> @@ -2506,20 +2265,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB30_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -2544,7 +2291,6 @@ ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 32 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) %ev = zext <2 x i32> %v to <2 x i64> @@ -2570,20 +2316,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) -; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -2596,7 +2330,6 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB31_8 ; RV64ZVE32F-NEXT: .LBB31_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB31_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) @@ -2630,7 +2363,6 @@ ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; 
RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru) ret <4 x i32> %v @@ -2652,67 +2384,53 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB32_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB32_6 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_6 ; RV64ZVE32F-NEXT: .LBB32_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB32_7 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_7 ; RV64ZVE32F-NEXT: .LBB32_3: # %else5 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB32_8 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB32_8 ; RV64ZVE32F-NEXT: .LBB32_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB32_5: # %cond.load -; RV64ZVE32F-NEXT: lw a4, 0(a4) +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu -; RV64ZVE32F-NEXT: vmv.s.x v8, a4 -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB32_2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_2 ; RV64ZVE32F-NEXT: .LBB32_6: # %cond.load1 -; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB32_3 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_3 ; RV64ZVE32F-NEXT: .LBB32_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB32_4 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB32_4 ; RV64ZVE32F-NEXT: .LBB32_8: # %cond.load7 -; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lw a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: 
vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -3990,24 +3708,12 @@ ; ; RV32ZVE32F-LABEL: mgather_v2i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a2, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a2) -; RV32ZVE32F-NEXT: lbu a4, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a4, v0 ; RV32ZVE32F-NEXT: andi a2, a4, 1 ; RV32ZVE32F-NEXT: beqz a2, .LBB43_3 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4033,25 +3739,12 @@ ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: sw a1, 8(a0) ; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a4, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a4) -; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi a5, a4, 1 ; RV64ZVE32F-NEXT: beqz a5, .LBB43_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -4064,7 +3757,6 @@ ; RV64ZVE32F-NEXT: .LBB43_4: # %else2 ; RV64ZVE32F-NEXT: mv a0, a2 ; RV64ZVE32F-NEXT: mv a1, a3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 8, <2 x i1> %m, <2 x i64> %passthru) ret <2 x i64> %v @@ -4089,24 +3781,12 @@ ; ; RV32ZVE32F-LABEL: mgather_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a2, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a2) -; RV32ZVE32F-NEXT: lbu a6, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a6, v0 ; RV32ZVE32F-NEXT: andi a2, a6, 1 ; RV32ZVE32F-NEXT: beqz a2, .LBB44_5 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; 
RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4162,25 +3842,12 @@ ; RV32ZVE32F-NEXT: sw a7, 20(a0) ; RV32ZVE32F-NEXT: sw a1, 24(a0) ; RV32ZVE32F-NEXT: sw a6, 28(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a3, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a3) -; RV64ZVE32F-NEXT: lbu a5, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 ; RV64ZVE32F-NEXT: andi a3, a5, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB44_5 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load @@ -4221,7 +3888,6 @@ ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a1, 24(a0) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %m, <4 x i64> %passthru) ret <4 x i64> %v @@ -4243,138 +3909,114 @@ ; ; RV32ZVE32F-LABEL: mgather_truemask_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: lw a2, 28(a1) -; RV32ZVE32F-NEXT: lw a3, 24(a1) -; RV32ZVE32F-NEXT: lw a4, 20(a1) -; RV32ZVE32F-NEXT: lw a5, 16(a1) -; RV32ZVE32F-NEXT: lw a6, 12(a1) -; RV32ZVE32F-NEXT: lw t0, 8(a1) -; RV32ZVE32F-NEXT: lw a7, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmset.m v0 -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi t1, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (t1) -; RV32ZVE32F-NEXT: lb t1, 15(sp) -; RV32ZVE32F-NEXT: beqz zero, .LBB45_6 -; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi t2, t1, 2 -; RV32ZVE32F-NEXT: bnez t2, .LBB45_7 -; RV32ZVE32F-NEXT: .LBB45_2: # %else2 -; RV32ZVE32F-NEXT: andi t2, t1, 4 -; RV32ZVE32F-NEXT: bnez t2, .LBB45_8 -; RV32ZVE32F-NEXT: .LBB45_3: # %else5 -; RV32ZVE32F-NEXT: andi t1, t1, 8 -; RV32ZVE32F-NEXT: beqz t1, .LBB45_5 -; RV32ZVE32F-NEXT: .LBB45_4: # %cond.load7 -; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmset.m v9 +; RV32ZVE32F-NEXT: vmv.x.s a6, v9 +; RV32ZVE32F-NEXT: bnez zero, .LBB45_5 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) -; RV32ZVE32F-NEXT: .LBB45_5: # %else8 -; RV32ZVE32F-NEXT: sw a1, 0(a0) -; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: sw t0, 8(a0) -; RV32ZVE32F-NEXT: sw a6, 12(a0) -; RV32ZVE32F-NEXT: 
sw a5, 16(a0) -; RV32ZVE32F-NEXT: sw a4, 20(a0) -; RV32ZVE32F-NEXT: sw a3, 24(a0) -; RV32ZVE32F-NEXT: sw a2, 28(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 -; RV32ZVE32F-NEXT: ret -; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a7, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) -; RV32ZVE32F-NEXT: andi t2, t1, 2 -; RV32ZVE32F-NEXT: beqz t2, .LBB45_2 -; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load1 +; RV32ZVE32F-NEXT: andi a4, a6, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB45_6 +; RV32ZVE32F-NEXT: .LBB45_2: +; RV32ZVE32F-NEXT: lw a4, 12(a1) +; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB45_7 +; RV32ZVE32F-NEXT: .LBB45_3: +; RV32ZVE32F-NEXT: lw a7, 20(a1) +; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: andi a6, a6, 8 +; RV32ZVE32F-NEXT: bnez a6, .LBB45_8 +; RV32ZVE32F-NEXT: .LBB45_4: +; RV32ZVE32F-NEXT: lw a6, 28(a1) +; RV32ZVE32F-NEXT: lw a1, 24(a1) +; RV32ZVE32F-NEXT: j .LBB45_9 +; RV32ZVE32F-NEXT: .LBB45_5: +; RV32ZVE32F-NEXT: lw a2, 4(a1) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: andi a4, a6, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB45_2 +; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s t0, v9 -; RV32ZVE32F-NEXT: lw a6, 4(t0) -; RV32ZVE32F-NEXT: lw t0, 0(t0) -; RV32ZVE32F-NEXT: andi t2, t1, 4 -; RV32ZVE32F-NEXT: beqz t2, .LBB45_3 -; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load4 -; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 ; RV32ZVE32F-NEXT: lw a4, 4(a5) ; RV32ZVE32F-NEXT: lw a5, 0(a5) -; RV32ZVE32F-NEXT: andi t1, t1, 8 -; RV32ZVE32F-NEXT: bnez t1, .LBB45_4 -; RV32ZVE32F-NEXT: j .LBB45_5 +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB45_3 +; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s t0, v9 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi a6, a6, 8 +; RV32ZVE32F-NEXT: beqz a6, .LBB45_4 +; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a6, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: .LBB45_9: # %else8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a3, 24(a2) -; RV64ZVE32F-NEXT: ld a4, 16(a2) -; RV64ZVE32F-NEXT: ld a5, 8(a2) -; RV64ZVE32F-NEXT: ld a2, 0(a2) -; RV64ZVE32F-NEXT: ld a6, 24(a1) -; RV64ZVE32F-NEXT: ld a7, 16(a1) -; RV64ZVE32F-NEXT: ld t0, 8(a1) -; RV64ZVE32F-NEXT: ld t1, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; 
RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a1) -; RV64ZVE32F-NEXT: lb a1, 15(sp) -; RV64ZVE32F-NEXT: beqz zero, .LBB45_6 -; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi t1, a1, 2 -; RV64ZVE32F-NEXT: bnez t1, .LBB45_7 -; RV64ZVE32F-NEXT: .LBB45_2: # %else2 -; RV64ZVE32F-NEXT: andi t0, a1, 4 -; RV64ZVE32F-NEXT: bnez t0, .LBB45_8 -; RV64ZVE32F-NEXT: .LBB45_3: # %else5 -; RV64ZVE32F-NEXT: andi a1, a1, 8 -; RV64ZVE32F-NEXT: beqz a1, .LBB45_5 -; RV64ZVE32F-NEXT: .LBB45_4: # %cond.load7 -; RV64ZVE32F-NEXT: ld a3, 0(a6) -; RV64ZVE32F-NEXT: .LBB45_5: # %else8 -; RV64ZVE32F-NEXT: sd a2, 0(a0) -; RV64ZVE32F-NEXT: sd a5, 8(a0) -; RV64ZVE32F-NEXT: sd a4, 16(a0) -; RV64ZVE32F-NEXT: sd a3, 24(a0) -; RV64ZVE32F-NEXT: addi sp, sp, 16 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load -; RV64ZVE32F-NEXT: ld a2, 0(t1) -; RV64ZVE32F-NEXT: andi t1, a1, 2 -; RV64ZVE32F-NEXT: beqz t1, .LBB45_2 -; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load1 -; RV64ZVE32F-NEXT: ld a5, 0(t0) -; RV64ZVE32F-NEXT: andi t0, a1, 4 -; RV64ZVE32F-NEXT: beqz t0, .LBB45_3 -; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load4 -; RV64ZVE32F-NEXT: ld a4, 0(a7) -; RV64ZVE32F-NEXT: andi a1, a1, 8 -; RV64ZVE32F-NEXT: bnez a1, .LBB45_4 -; RV64ZVE32F-NEXT: j .LBB45_5 +; RV64ZVE32F-NEXT: vmset.m v8 +; RV64ZVE32F-NEXT: vmv.x.s a5, v8 +; RV64ZVE32F-NEXT: bnez zero, .LBB45_5 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB45_6 +; RV64ZVE32F-NEXT: .LBB45_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: bnez a6, .LBB45_7 +; RV64ZVE32F-NEXT: .LBB45_3: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: bnez a5, .LBB45_8 +; RV64ZVE32F-NEXT: .LBB45_4: +; RV64ZVE32F-NEXT: ld a1, 24(a2) +; RV64ZVE32F-NEXT: j .LBB45_9 +; RV64ZVE32F-NEXT: .LBB45_5: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB45_2 +; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a4, 8(a1) +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: beqz a6, .LBB45_3 +; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a6, 16(a1) +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: beqz a5, .LBB45_4 +; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a1, 24(a1) +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB45_9: # %else8 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x i64> %passthru) @@ -7605,27 +7247,14 @@ ; ; RV64ZVE32F-LABEL: mgather_v2f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, 
tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB59_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB59_4 ; RV64ZVE32F-NEXT: .LBB59_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB59_3: # %cond.load ; RV64ZVE32F-NEXT: flh ft0, 0(a0) @@ -7639,7 +7268,6 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru) ret <2 x half> %v @@ -7664,20 +7292,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) -; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -7690,7 +7306,6 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB60_8 ; RV64ZVE32F-NEXT: .LBB60_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB60_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) @@ -7724,7 +7339,6 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru) ret <4 x half> %v @@ -7747,67 +7361,53 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB61_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, 
a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB61_6 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_6 ; RV64ZVE32F-NEXT: .LBB61_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB61_7 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_7 ; RV64ZVE32F-NEXT: .LBB61_3: # %else5 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB61_8 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB61_8 ; RV64ZVE32F-NEXT: .LBB61_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB61_5: # %cond.load -; RV64ZVE32F-NEXT: flh ft0, 0(a4) +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB61_2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_2 ; RV64ZVE32F-NEXT: .LBB61_6: # %cond.load1 -; RV64ZVE32F-NEXT: flh ft0, 0(a3) +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB61_3 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_3 ; RV64ZVE32F-NEXT: .LBB61_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flh ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB61_4 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB61_4 ; RV64ZVE32F-NEXT: .LBB61_8: # %cond.load7 -; RV64ZVE32F-NEXT: flh ft0, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -8624,27 +8224,14 @@ ; ; RV64ZVE32F-LABEL: mgather_v2f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB69_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB69_4 ; RV64ZVE32F-NEXT: .LBB69_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB69_3: # %cond.load ; RV64ZVE32F-NEXT: flw ft0, 0(a0) @@ -8658,7 +8245,6 @@ 
; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru) ret <2 x float> %v @@ -8683,20 +8269,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) -; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: bnez a2, .LBB70_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -8709,7 +8283,6 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 8 ; RV64ZVE32F-NEXT: bnez a1, .LBB70_8 ; RV64ZVE32F-NEXT: .LBB70_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB70_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) @@ -8743,7 +8316,6 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru) ret <4 x float> %v @@ -8765,67 +8337,53 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB71_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB71_6 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_6 ; RV64ZVE32F-NEXT: .LBB71_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB71_7 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_7 ; RV64ZVE32F-NEXT: .LBB71_3: # %else5 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB71_8 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB71_8 ; RV64ZVE32F-NEXT: .LBB71_4: # %else8 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB71_5: # %cond.load -; RV64ZVE32F-NEXT: flw ft0, 0(a4) +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; 
RV64ZVE32F-NEXT: flw ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB71_2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_2 ; RV64ZVE32F-NEXT: .LBB71_6: # %cond.load1 -; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB71_3 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_3 ; RV64ZVE32F-NEXT: .LBB71_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: flw ft0, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 -; RV64ZVE32F-NEXT: andi a0, a0, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB71_4 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB71_4 ; RV64ZVE32F-NEXT: .LBB71_8: # %cond.load7 -; RV64ZVE32F-NEXT: flw ft0, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -10101,30 +9659,17 @@ ; ; RV32ZVE32F-LABEL: mgather_v2f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a0, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a0) -; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 ; RV32ZVE32F-NEXT: andi a1, a0, 1 ; RV32ZVE32F-NEXT: bnez a1, .LBB82_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB82_4 ; RV32ZVE32F-NEXT: .LBB82_2: # %else2 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB82_3: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a0, a0, 2 @@ -10134,32 +9679,18 @@ ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fld fa1, 0(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: 
vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB82_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB82_4 ; RV64ZVE32F-NEXT: .LBB82_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB82_3: # %cond.load ; RV64ZVE32F-NEXT: fld fa0, 0(a0) @@ -10167,7 +9698,6 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB82_2 ; RV64ZVE32F-NEXT: .LBB82_4: # %cond.load1 ; RV64ZVE32F-NEXT: fld fa1, 0(a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> %m, <2 x double> %passthru) ret <2 x double> %v @@ -10192,20 +9722,8 @@ ; ; RV32ZVE32F-LABEL: mgather_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a1, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a1) -; RV32ZVE32F-NEXT: lbu a1, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 ; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: bnez a2, .LBB83_6 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -10227,10 +9745,9 @@ ; RV32ZVE32F-NEXT: fsd fa1, 8(a0) ; RV32ZVE32F-NEXT: fsd fa2, 16(a0) ; RV32ZVE32F-NEXT: fsd fa3, 24(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB83_6: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10253,20 +9770,8 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB83_6 ; RV64ZVE32F-NEXT: # %bb.1: # %else @@ -10286,7 +9791,6 @@ ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) ; RV64ZVE32F-NEXT: fsd fa3, 24(a0) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB83_6: # %cond.load ; RV64ZVE32F-NEXT: ld a3, 0(a1) @@ 
-10324,21 +9828,9 @@ ; ; RV32ZVE32F-LABEL: mgather_truemask_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmset.m v0 -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a1, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a1) -; RV32ZVE32F-NEXT: lb a1, 15(sp) +; RV32ZVE32F-NEXT: vmset.m v9 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: beqz zero, .LBB84_6 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10359,7 +9851,6 @@ ; RV32ZVE32F-NEXT: fsd fa1, 8(a0) ; RV32ZVE32F-NEXT: fsd fa2, 16(a0) ; RV32ZVE32F-NEXT: fsd fa3, 24(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB84_6: # %cond.load ; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu @@ -10385,56 +9876,43 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: ld a2, 24(a1) -; RV64ZVE32F-NEXT: ld a3, 16(a1) -; RV64ZVE32F-NEXT: ld a4, 8(a1) -; RV64ZVE32F-NEXT: ld a5, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a1) -; RV64ZVE32F-NEXT: lb a1, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: beqz zero, .LBB84_6 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a5, a1, 2 -; RV64ZVE32F-NEXT: bnez a5, .LBB84_7 +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB84_7 ; RV64ZVE32F-NEXT: .LBB84_2: # %else2 -; RV64ZVE32F-NEXT: andi a4, a1, 4 -; RV64ZVE32F-NEXT: bnez a4, .LBB84_8 +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB84_8 ; RV64ZVE32F-NEXT: .LBB84_3: # %else5 -; RV64ZVE32F-NEXT: andi a1, a1, 8 -; RV64ZVE32F-NEXT: beqz a1, .LBB84_5 +; RV64ZVE32F-NEXT: andi a2, a2, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_5 ; RV64ZVE32F-NEXT: .LBB84_4: # %cond.load7 -; RV64ZVE32F-NEXT: fld fa3, 0(a2) +; RV64ZVE32F-NEXT: ld a1, 24(a1) +; RV64ZVE32F-NEXT: fld fa3, 0(a1) ; RV64ZVE32F-NEXT: .LBB84_5: # %else8 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) ; RV64ZVE32F-NEXT: fsd fa3, 24(a0) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB84_6: # %cond.load -; RV64ZVE32F-NEXT: fld fa0, 0(a5) -; RV64ZVE32F-NEXT: andi a5, a1, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB84_2 +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB84_2 ; RV64ZVE32F-NEXT: .LBB84_7: # %cond.load1 -; RV64ZVE32F-NEXT: fld fa1, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a1, 4 -; RV64ZVE32F-NEXT: beqz a4, .LBB84_3 +; RV64ZVE32F-NEXT: ld a3, 8(a1) +; 
RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB84_3 ; RV64ZVE32F-NEXT: .LBB84_8: # %cond.load4 +; RV64ZVE32F-NEXT: ld a3, 16(a1) ; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a1, a1, 8 -; RV64ZVE32F-NEXT: bnez a1, .LBB84_4 +; RV64ZVE32F-NEXT: andi a2, a2, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_4 ; RV64ZVE32F-NEXT: j .LBB84_5 %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -69,27 +69,14 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB1_4 ; RV64ZVE32F-NEXT: .LBB1_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB1_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu @@ -100,7 +87,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void @@ -130,21 +116,8 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 ; RV64ZVE32F-NEXT: bnez a3, .LBB2_3 @@ -152,7 +125,6 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB2_4 ; RV64ZVE32F-NEXT: .LBB2_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB2_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu @@ -163,7 +135,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: 
vse8.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i16> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) @@ -200,23 +171,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 ; RV64ZVE32F-NEXT: bnez a3, .LBB3_3 @@ -224,7 +182,6 @@ ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB3_4 ; RV64ZVE32F-NEXT: .LBB3_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB3_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu @@ -235,7 +192,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) @@ -288,31 +244,20 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: addi sp, sp, -16 ; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a4, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a4) -; RV64ZVE32F-NEXT: lbu a4, 15(sp) -; RV64ZVE32F-NEXT: sb a1, 14(sp) -; RV64ZVE32F-NEXT: sb a0, 13(sp) +; RV64ZVE32F-NEXT: sb a1, 15(sp) +; RV64ZVE32F-NEXT: sb a0, 14(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: addi a0, sp, 14 +; RV64ZVE32F-NEXT: addi a0, sp, 15 ; RV64ZVE32F-NEXT: vle8.v v9, (a0) -; RV64ZVE32F-NEXT: addi a0, sp, 13 +; RV64ZVE32F-NEXT: addi a0, sp, 14 ; RV64ZVE32F-NEXT: vle8.v v8, (a0) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; RV64ZVE32F-NEXT: andi a0, a4, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v0 +; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: bnez a0, .LBB4_3 +; RV64ZVE32F-NEXT: bnez a1, .LBB4_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB4_4 ; RV64ZVE32F-NEXT: .LBB4_2: # %else2 ; RV64ZVE32F-NEXT: addi sp, sp, 16 @@ -320,7 +265,7 @@ ; RV64ZVE32F-NEXT: .LBB4_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vse8.v v8, (a2) -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; 
RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB4_2 ; RV64ZVE32F-NEXT: .LBB4_4: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu @@ -350,60 +295,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB5_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB5_6 ; RV64ZVE32F-NEXT: .LBB5_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB5_7 ; RV64ZVE32F-NEXT: .LBB5_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB5_8 ; RV64ZVE32F-NEXT: .LBB5_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB5_5: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_2 ; RV64ZVE32F-NEXT: .LBB5_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v9, (a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_3 ; RV64ZVE32F-NEXT: .LBB5_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v9, (a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB5_4 ; RV64ZVE32F-NEXT: .LBB5_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %m) ret void @@ -424,60 +355,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; 
RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB6_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB6_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB6_6 ; RV64ZVE32F-NEXT: .LBB6_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB6_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB6_7 ; RV64ZVE32F-NEXT: .LBB6_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB6_8 ; RV64ZVE32F-NEXT: .LBB6_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB6_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vse8.v v8, (a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB6_2 +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB6_2 ; RV64ZVE32F-NEXT: .LBB6_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vse8.v v9, (a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB6_3 +; RV64ZVE32F-NEXT: vse8.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB6_3 ; RV64ZVE32F-NEXT: .LBB6_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB6_4 ; RV64ZVE32F-NEXT: .LBB6_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -777,27 +694,14 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB11_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB11_4 ; RV64ZVE32F-NEXT: .LBB11_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB11_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu @@ -808,7 +712,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; 
RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %val, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -838,29 +741,16 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB12_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB12_4 ; RV64ZVE32F-NEXT: .LBB12_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB12_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu @@ -871,7 +761,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i16> call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) @@ -920,31 +809,21 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: addi sp, sp, -16 ; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a4, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a4) -; RV64ZVE32F-NEXT: lbu a4, 15(sp) -; RV64ZVE32F-NEXT: sh a1, 12(sp) -; RV64ZVE32F-NEXT: sh a0, 10(sp) +; RV64ZVE32F-NEXT: sh a1, 14(sp) +; RV64ZVE32F-NEXT: sh a0, 12(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: addi a0, sp, 12 +; RV64ZVE32F-NEXT: addi a0, sp, 14 ; RV64ZVE32F-NEXT: vle16.v v9, (a0) -; RV64ZVE32F-NEXT: addi a0, sp, 10 +; RV64ZVE32F-NEXT: addi a0, sp, 12 ; RV64ZVE32F-NEXT: vle16.v v8, (a0) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; RV64ZVE32F-NEXT: andi a0, a4, 1 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: bnez a0, .LBB13_3 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v0 +; RV64ZVE32F-NEXT: andi a1, a0, 1 +; RV64ZVE32F-NEXT: bnez a1, .LBB13_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB13_4 ; RV64ZVE32F-NEXT: .LBB13_2: # %else2 ; RV64ZVE32F-NEXT: addi sp, sp, 16 @@ -952,7 +831,7 @@ ; RV64ZVE32F-NEXT: .LBB13_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu 
; RV64ZVE32F-NEXT: vse16.v v8, (a2) -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB13_2 ; RV64ZVE32F-NEXT: .LBB13_4: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu @@ -982,60 +861,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB14_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB14_6 ; RV64ZVE32F-NEXT: .LBB14_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB14_7 ; RV64ZVE32F-NEXT: .LBB14_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB14_8 ; RV64ZVE32F-NEXT: .LBB14_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB14_5: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_2 ; RV64ZVE32F-NEXT: .LBB14_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_3 ; RV64ZVE32F-NEXT: .LBB14_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v9, (a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB14_4 ; RV64ZVE32F-NEXT: .LBB14_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %m) ret void @@ -1056,60 +921,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i 
v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB15_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB15_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB15_6 ; RV64ZVE32F-NEXT: .LBB15_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB15_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB15_7 ; RV64ZVE32F-NEXT: .LBB15_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB15_8 ; RV64ZVE32F-NEXT: .LBB15_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB15_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vse16.v v8, (a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB15_2 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB15_2 ; RV64ZVE32F-NEXT: .LBB15_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v9, (a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB15_3 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB15_3 ; RV64ZVE32F-NEXT: .LBB15_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB15_4 ; RV64ZVE32F-NEXT: .LBB15_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -1823,27 +1674,14 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB23_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB23_4 ; RV64ZVE32F-NEXT: .LBB23_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB23_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, 
mu @@ -1854,7 +1692,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) ret void @@ -1888,47 +1725,37 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -32 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 32 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a4, sp, 31 -; RV64ZVE32F-NEXT: vsm.v v8, (a4) -; RV64ZVE32F-NEXT: lbu a4, 31(sp) -; RV64ZVE32F-NEXT: sw a1, 24(sp) -; RV64ZVE32F-NEXT: sw a0, 20(sp) +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: sw a1, 12(sp) +; RV64ZVE32F-NEXT: sw a0, 8(sp) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: addi a0, sp, 24 +; RV64ZVE32F-NEXT: addi a0, sp, 12 ; RV64ZVE32F-NEXT: vle32.v v9, (a0) -; RV64ZVE32F-NEXT: addi a0, sp, 20 +; RV64ZVE32F-NEXT: addi a0, sp, 8 ; RV64ZVE32F-NEXT: vle32.v v8, (a0) ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu -; RV64ZVE32F-NEXT: andi a0, a4, 1 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: bnez a0, .LBB24_3 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v0 +; RV64ZVE32F-NEXT: andi a1, a0, 1 +; RV64ZVE32F-NEXT: bnez a1, .LBB24_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 ; RV64ZVE32F-NEXT: .LBB24_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB24_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a2) -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB24_2 ; RV64ZVE32F-NEXT: .LBB24_4: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v8, (a3) -; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i32> call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) @@ -1952,60 +1779,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; 
RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB25_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB25_6 ; RV64ZVE32F-NEXT: .LBB25_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB25_7 ; RV64ZVE32F-NEXT: .LBB25_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB25_8 ; RV64ZVE32F-NEXT: .LBB25_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB25_5: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_2 ; RV64ZVE32F-NEXT: .LBB25_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_3 ; RV64ZVE32F-NEXT: .LBB25_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: vse32.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB25_4 ; RV64ZVE32F-NEXT: .LBB25_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %m) ret void @@ -2026,60 +1839,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB26_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB26_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB26_6 ; RV64ZVE32F-NEXT: .LBB26_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB26_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB26_7 ; RV64ZVE32F-NEXT: .LBB26_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB26_8 ; RV64ZVE32F-NEXT: .LBB26_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; 
RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB26_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vse32.v v8, (a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB26_2 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB26_2 ; RV64ZVE32F-NEXT: .LBB26_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB26_3 +; RV64ZVE32F-NEXT: vse32.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB26_3 ; RV64ZVE32F-NEXT: .LBB26_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB26_4 ; RV64ZVE32F-NEXT: .LBB26_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -3198,34 +2997,21 @@ ; ; RV32ZVE32F-LABEL: mscatter_v2i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a1, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a1) -; RV32ZVE32F-NEXT: lbu a3, 15(sp) ; RV32ZVE32F-NEXT: lw a2, 12(a0) ; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v0 ; RV32ZVE32F-NEXT: andi a4, a3, 1 ; RV32ZVE32F-NEXT: bnez a4, .LBB37_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a3, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB37_4 ; RV32ZVE32F-NEXT: .LBB37_2: # %else2 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB37_3: # %cond.store ; RV32ZVE32F-NEXT: lw a4, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a5, v8 ; RV32ZVE32F-NEXT: sw a4, 4(a5) ; RV32ZVE32F-NEXT: sw a0, 0(a5) @@ -3237,32 +3023,18 @@ ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: sw a1, 0(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; 
RV64ZVE32F-NEXT: addi a4, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a4) -; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi a5, a4, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB37_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB37_4 ; RV64ZVE32F-NEXT: .LBB37_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB37_3: # %cond.store ; RV64ZVE32F-NEXT: sd a0, 0(a2) @@ -3270,7 +3042,6 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB37_2 ; RV64ZVE32F-NEXT: .LBB37_4: # %cond.store1 ; RV64ZVE32F-NEXT: sd a1, 0(a3) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %val, <2 x i64*> %ptrs, i32 8, <2 x i1> %m) ret void @@ -3293,26 +3064,14 @@ ; ; RV32ZVE32F-LABEL: mscatter_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a1, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a1) ; RV32ZVE32F-NEXT: lw a1, 28(a0) ; RV32ZVE32F-NEXT: lw a2, 24(a0) ; RV32ZVE32F-NEXT: lw a3, 20(a0) ; RV32ZVE32F-NEXT: lw a4, 16(a0) -; RV32ZVE32F-NEXT: lbu a5, 15(sp) ; RV32ZVE32F-NEXT: lw a7, 12(a0) ; RV32ZVE32F-NEXT: lw a6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a5, v0 ; RV32ZVE32F-NEXT: andi t0, a5, 1 ; RV32ZVE32F-NEXT: bnez t0, .LBB38_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -3325,12 +3084,11 @@ ; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB38_8 ; RV32ZVE32F-NEXT: .LBB38_4: # %else6 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB38_5: # %cond.store ; RV32ZVE32F-NEXT: lw t0, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s t1, v8 ; RV32ZVE32F-NEXT: sw t0, 4(t1) ; RV32ZVE32F-NEXT: sw a0, 0(t1) @@ -3358,62 +3116,47 @@ ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 -; RV32ZVE32F-NEXT: ret -; -; RV64ZVE32F-LABEL: mscatter_v4i64: -; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a2) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4i64: +; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a2, 24(a1) ; RV64ZVE32F-NEXT: ld a4, 16(a1) ; RV64ZVE32F-NEXT: ld a7, 8(a1) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: lbu a5, 15(sp) -; RV64ZVE32F-NEXT: ld a6, 16(a0) +; RV64ZVE32F-NEXT: ld a5, 
16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) -; RV64ZVE32F-NEXT: andi t1, a5, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi t1, a6, 1 ; RV64ZVE32F-NEXT: bnez t1, .LBB38_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a6, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_6 ; RV64ZVE32F-NEXT: .LBB38_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a6, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_7 ; RV64ZVE32F-NEXT: .LBB38_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: andi a0, a6, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_8 ; RV64ZVE32F-NEXT: .LBB38_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB38_5: # %cond.store ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: sd a0, 0(a1) -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a6, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_2 ; RV64ZVE32F-NEXT: .LBB38_6: # %cond.store1 ; RV64ZVE32F-NEXT: sd t0, 0(a7) -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a6, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_3 ; RV64ZVE32F-NEXT: .LBB38_7: # %cond.store3 -; RV64ZVE32F-NEXT: sd a6, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: sd a5, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a6, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_4 ; RV64ZVE32F-NEXT: .LBB38_8: # %cond.store5 ; RV64ZVE32F-NEXT: sd a3, 0(a2) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %m) ret void @@ -3434,64 +3177,51 @@ ; ; RV32ZVE32F-LABEL: mscatter_truemask_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV32ZVE32F-NEXT: lw a1, 28(a0) ; RV32ZVE32F-NEXT: lw a2, 24(a0) ; RV32ZVE32F-NEXT: lw a3, 20(a0) ; RV32ZVE32F-NEXT: lw a4, 16(a0) -; RV32ZVE32F-NEXT: lw a6, 12(a0) -; RV32ZVE32F-NEXT: lw a5, 8(a0) -; RV32ZVE32F-NEXT: lw t0, 4(a0) -; RV32ZVE32F-NEXT: lw a7, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmset.m v0 -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: lw a7, 12(a0) +; RV32ZVE32F-NEXT: lw a6, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a0, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a0) -; RV32ZVE32F-NEXT: lb a0, 15(sp) +; RV32ZVE32F-NEXT: vmset.m v9 +; RV32ZVE32F-NEXT: vmv.x.s a5, v9 ; RV32ZVE32F-NEXT: beqz zero, .LBB39_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a7, a0, 2 -; RV32ZVE32F-NEXT: bnez a7, .LBB39_6 +; RV32ZVE32F-NEXT: andi a0, a5, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB39_6 ; RV32ZVE32F-NEXT: .LBB39_2: # %else2 -; RV32ZVE32F-NEXT: andi a5, a0, 4 -; RV32ZVE32F-NEXT: bnez a5, .LBB39_7 +; RV32ZVE32F-NEXT: andi a0, a5, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB39_7 ; RV32ZVE32F-NEXT: .LBB39_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB39_8 ; RV32ZVE32F-NEXT: .LBB39_4: # %else6 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB39_5: # %cond.store +; RV32ZVE32F-NEXT: lw t0, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) ; RV32ZVE32F-NEXT: 
vsetivli zero, 0, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s t1, v8 ; RV32ZVE32F-NEXT: sw t0, 4(t1) -; RV32ZVE32F-NEXT: sw a7, 0(t1) -; RV32ZVE32F-NEXT: andi a7, a0, 2 -; RV32ZVE32F-NEXT: beqz a7, .LBB39_2 +; RV32ZVE32F-NEXT: sw a0, 0(t1) +; RV32ZVE32F-NEXT: andi a0, a5, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB39_2 ; RV32ZVE32F-NEXT: .LBB39_6: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV32ZVE32F-NEXT: vmv.x.s a7, v9 -; RV32ZVE32F-NEXT: sw a6, 4(a7) -; RV32ZVE32F-NEXT: sw a5, 0(a7) -; RV32ZVE32F-NEXT: andi a5, a0, 4 -; RV32ZVE32F-NEXT: beqz a5, .LBB39_3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: sw a7, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB39_3 ; RV32ZVE32F-NEXT: .LBB39_7: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: sw a4, 0(a5) -; RV32ZVE32F-NEXT: sw a3, 4(a5) -; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB39_4 ; RV32ZVE32F-NEXT: .LBB39_8: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu @@ -3499,62 +3229,47 @@ ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a2, 24(a1) -; RV64ZVE32F-NEXT: ld a3, 16(a1) -; RV64ZVE32F-NEXT: ld a5, 8(a1) -; RV64ZVE32F-NEXT: ld a7, 0(a1) -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a4, 16(a0) -; RV64ZVE32F-NEXT: ld a6, 8(a0) -; RV64ZVE32F-NEXT: ld t0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: ld a4, 16(a1) +; RV64ZVE32F-NEXT: ld a7, 8(a1) +; RV64ZVE32F-NEXT: ld a3, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 ; RV64ZVE32F-NEXT: beqz zero, .LBB39_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a7, a0, 2 -; RV64ZVE32F-NEXT: bnez a7, .LBB39_6 +; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB39_6 ; RV64ZVE32F-NEXT: .LBB39_2: # %else2 -; RV64ZVE32F-NEXT: andi a5, a0, 4 -; RV64ZVE32F-NEXT: bnez a5, .LBB39_7 +; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB39_7 ; RV64ZVE32F-NEXT: .LBB39_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a6, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB39_8 ; RV64ZVE32F-NEXT: .LBB39_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB39_5: # %cond.store -; RV64ZVE32F-NEXT: sd t0, 0(a7) -; RV64ZVE32F-NEXT: andi a7, a0, 2 -; RV64ZVE32F-NEXT: beqz a7, .LBB39_2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; 
RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: sd a0, 0(a1) +; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB39_2 ; RV64ZVE32F-NEXT: .LBB39_6: # %cond.store1 -; RV64ZVE32F-NEXT: sd a6, 0(a5) -; RV64ZVE32F-NEXT: andi a5, a0, 4 -; RV64ZVE32F-NEXT: beqz a5, .LBB39_3 +; RV64ZVE32F-NEXT: sd t0, 0(a7) +; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB39_3 ; RV64ZVE32F-NEXT: .LBB39_7: # %cond.store3 -; RV64ZVE32F-NEXT: sd a4, 0(a3) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: sd a5, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a6, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB39_4 ; RV64ZVE32F-NEXT: .LBB39_8: # %cond.store5 -; RV64ZVE32F-NEXT: sd a1, 0(a2) -; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: sd a3, 0(a2) ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -6435,27 +6150,14 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB53_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB53_4 ; RV64ZVE32F-NEXT: .LBB53_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB53_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu @@ -6466,7 +6168,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> %val, <2 x half*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -6489,60 +6190,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB54_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB54_6 ; 
RV64ZVE32F-NEXT: .LBB54_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB54_7 ; RV64ZVE32F-NEXT: .LBB54_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB54_8 ; RV64ZVE32F-NEXT: .LBB54_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB54_5: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vse16.v v8, (a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_2 ; RV64ZVE32F-NEXT: .LBB54_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v9, (a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_3 ; RV64ZVE32F-NEXT: .LBB54_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v9, (a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB54_4 ; RV64ZVE32F-NEXT: .LBB54_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %m) ret void @@ -6563,60 +6250,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB55_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB55_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB55_6 ; RV64ZVE32F-NEXT: .LBB55_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB55_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB55_7 ; RV64ZVE32F-NEXT: .LBB55_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB55_8 ; RV64ZVE32F-NEXT: .LBB55_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB55_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu -; RV64ZVE32F-NEXT: vse16.v v8, (a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, 
.LBB55_2 ; RV64ZVE32F-NEXT: .LBB55_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vse16.v v9, (a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB55_3 ; RV64ZVE32F-NEXT: .LBB55_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB55_4 ; RV64ZVE32F-NEXT: .LBB55_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse16.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -7330,27 +7003,14 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB63_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB63_4 ; RV64ZVE32F-NEXT: .LBB63_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB63_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu @@ -7361,7 +7021,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %m) ret void @@ -7384,60 +7043,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB64_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; 
RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB64_6 ; RV64ZVE32F-NEXT: .LBB64_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB64_7 ; RV64ZVE32F-NEXT: .LBB64_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB64_8 ; RV64ZVE32F-NEXT: .LBB64_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB64_5: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vse32.v v8, (a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_2 ; RV64ZVE32F-NEXT: .LBB64_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v9, (a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_3 ; RV64ZVE32F-NEXT: .LBB64_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: vse32.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB64_4 ; RV64ZVE32F-NEXT: .LBB64_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %m) ret void @@ -7458,60 +7103,46 @@ ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v10, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v9, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: beqz zero, .LBB65_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB65_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB65_6 ; RV64ZVE32F-NEXT: .LBB65_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB65_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB65_7 ; RV64ZVE32F-NEXT: .LBB65_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB65_8 ; RV64ZVE32F-NEXT: .LBB65_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB65_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vse32.v v8, (a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB65_2 +; 
RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB65_2 ; RV64ZVE32F-NEXT: .LBB65_6: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vse32.v v9, (a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB65_3 +; RV64ZVE32F-NEXT: vse32.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB65_3 ; RV64ZVE32F-NEXT: .LBB65_7: # %cond.store3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV64ZVE32F-NEXT: vse32.v v9, (a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB65_4 ; RV64ZVE32F-NEXT: .LBB65_8: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v8, (a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer @@ -8629,30 +8260,17 @@ ; ; RV32ZVE32F-LABEL: mscatter_v2f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a0, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a0) -; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 ; RV32ZVE32F-NEXT: andi a1, a0, 1 ; RV32ZVE32F-NEXT: bnez a1, .LBB76_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB76_4 ; RV32ZVE32F-NEXT: .LBB76_2: # %else2 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB76_3: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a0, a0, 2 @@ -8662,32 +8280,18 @@ ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa1, 0(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_v2f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a2, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a2) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB76_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a2, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB76_4 ; 
RV64ZVE32F-NEXT: .LBB76_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB76_3: # %cond.store ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) @@ -8695,7 +8299,6 @@ ; RV64ZVE32F-NEXT: beqz a0, .LBB76_2 ; RV64ZVE32F-NEXT: .LBB76_4: # %cond.store1 ; RV64ZVE32F-NEXT: fsd fa1, 0(a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 8, <2 x i1> %m) ret void @@ -8718,20 +8321,8 @@ ; ; RV32ZVE32F-LABEL: mscatter_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a0, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a0) -; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 ; RV32ZVE32F-NEXT: andi a1, a0, 1 ; RV32ZVE32F-NEXT: bnez a1, .LBB77_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else @@ -8744,10 +8335,9 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB77_8 ; RV32ZVE32F-NEXT: .LBB77_4: # %else6 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB77_5: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8771,58 +8361,43 @@ ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa3, 0(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a1, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a1) ; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: lbu a2, 15(sp) -; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a5, a3, 1 ; RV64ZVE32F-NEXT: bnez a5, .LBB77_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB77_6 ; RV64ZVE32F-NEXT: .LBB77_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB77_7 ; RV64ZVE32F-NEXT: .LBB77_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB77_8 ; RV64ZVE32F-NEXT: .LBB77_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB77_5: # 
%cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB77_2 ; RV64ZVE32F-NEXT: .LBB77_6: # %cond.store1 ; RV64ZVE32F-NEXT: fsd fa1, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: andi a0, a3, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB77_3 ; RV64ZVE32F-NEXT: .LBB77_7: # %cond.store3 -; RV64ZVE32F-NEXT: fsd fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB77_4 ; RV64ZVE32F-NEXT: .LBB77_8: # %cond.store5 ; RV64ZVE32F-NEXT: fsd fa3, 0(a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %m) ret void @@ -8843,21 +8418,9 @@ ; ; RV32ZVE32F-LABEL: mscatter_truemask_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -16 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32ZVE32F-NEXT: vmset.m v0 -; RV32ZVE32F-NEXT: vmv.v.i v9, 0 -; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmv.v.i v10, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 -; RV32ZVE32F-NEXT: addi a0, sp, 15 -; RV32ZVE32F-NEXT: vsm.v v9, (a0) -; RV32ZVE32F-NEXT: lb a0, 15(sp) +; RV32ZVE32F-NEXT: vmset.m v9 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 ; RV32ZVE32F-NEXT: beqz zero, .LBB78_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8869,7 +8432,6 @@ ; RV32ZVE32F-NEXT: andi a0, a0, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB78_8 ; RV32ZVE32F-NEXT: .LBB78_4: # %else6 -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB78_5: # %cond.store ; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu @@ -8896,58 +8458,43 @@ ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa3, 0(a0) -; RV32ZVE32F-NEXT: addi sp, sp, 16 ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) -; RV64ZVE32F-NEXT: ld a3, 8(a0) -; RV64ZVE32F-NEXT: ld a4, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64ZVE32F-NEXT: vmset.m v0 -; RV64ZVE32F-NEXT: vmv.v.i v8, 0 -; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v9, 0 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: ld a4, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 -; RV64ZVE32F-NEXT: addi a0, sp, 15 -; RV64ZVE32F-NEXT: vsm.v v8, (a0) -; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: vmset.m v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: beqz zero, .LBB78_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: bnez a4, .LBB78_6 +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB78_6 ; RV64ZVE32F-NEXT: .LBB78_2: # %else2 -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: bnez a3, .LBB78_7 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB78_7 ; 
RV64ZVE32F-NEXT: .LBB78_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB78_8 ; RV64ZVE32F-NEXT: .LBB78_4: # %else6 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB78_5: # %cond.store -; RV64ZVE32F-NEXT: fsd fa0, 0(a4) -; RV64ZVE32F-NEXT: andi a4, a0, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB78_2 +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB78_2 ; RV64ZVE32F-NEXT: .LBB78_6: # %cond.store1 -; RV64ZVE32F-NEXT: fsd fa1, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a0, 4 -; RV64ZVE32F-NEXT: beqz a3, .LBB78_3 +; RV64ZVE32F-NEXT: fsd fa1, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB78_3 ; RV64ZVE32F-NEXT: .LBB78_7: # %cond.store3 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: andi a0, a3, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB78_4 ; RV64ZVE32F-NEXT: .LBB78_8: # %cond.store5 ; RV64ZVE32F-NEXT: fsd fa3, 0(a1) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -49,24 +49,18 @@ define <2 x i16> @mgather_v2i16_align1(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { ; RV32-LABEL: mgather_v2i16_align1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32-NEXT: vslideup.vi v11, v10, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmsne.vi v10, v11, 0 -; RV32-NEXT: addi a0, sp, 15 -; RV32-NEXT: vsm.v v10, (a0) -; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV32-NEXT: vmv.x.s a0, v0 ; RV32-NEXT: andi a1, a0, 1 -; RV32-NEXT: beqz a1, .LBB4_2 -; RV32-NEXT: # %bb.1: # %cond.load -; RV32-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; RV32-NEXT: bnez a1, .LBB4_3 +; RV32-NEXT: # %bb.1: # %else +; RV32-NEXT: andi a0, a0, 2 +; RV32-NEXT: bnez a0, .LBB4_4 +; RV32-NEXT: .LBB4_2: # %else2 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; RV32-NEXT: .LBB4_3: # %cond.load +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: lb a2, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) @@ -74,10 +68,9 @@ ; RV32-NEXT: or a1, a2, a1 ; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu ; RV32-NEXT: vmv.s.x v9, a1 -; RV32-NEXT: .LBB4_2: # %else ; RV32-NEXT: andi a0, a0, 2 -; RV32-NEXT: beqz a0, .LBB4_4 -; RV32-NEXT: # %bb.3: # %cond.load1 +; RV32-NEXT: beqz a0, .LBB4_2 +; RV32-NEXT: .LBB4_4: # %cond.load1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vmv.x.s a0, v8 @@ -88,31 +81,23 @@ ; RV32-NEXT: vmv.s.x v8, a0 ; RV32-NEXT: vsetivli zero, 2, e16, mf4, tu, mu ; RV32-NEXT: vslideup.vi v9, v8, 1 -; RV32-NEXT: .LBB4_4: # %else2 ; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: mgather_v2i16_align1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: 
.cfi_def_cfa_offset 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmv.v.i v11, 0 -; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64-NEXT: vslideup.vi v11, v10, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmsne.vi v10, v11, 0 -; RV64-NEXT: addi a0, sp, 15 -; RV64-NEXT: vsm.v v10, (a0) -; RV64-NEXT: lbu a0, 15(sp) +; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV64-NEXT: vmv.x.s a0, v0 ; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: beqz a1, .LBB4_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: bnez a1, .LBB4_3 +; RV64-NEXT: # %bb.1: # %else +; RV64-NEXT: andi a0, a0, 2 +; RV64-NEXT: bnez a0, .LBB4_4 +; RV64-NEXT: .LBB4_2: # %else2 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret +; RV64-NEXT: .LBB4_3: # %cond.load +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: lb a2, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) @@ -120,10 +105,9 @@ ; RV64-NEXT: or a1, a2, a1 ; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu ; RV64-NEXT: vmv.s.x v9, a1 -; RV64-NEXT: .LBB4_2: # %else ; RV64-NEXT: andi a0, a0, 2 -; RV64-NEXT: beqz a0, .LBB4_4 -; RV64-NEXT: # %bb.3: # %cond.load1 +; RV64-NEXT: beqz a0, .LBB4_2 +; RV64-NEXT: .LBB4_4: # %cond.load1 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: vmv.x.s a0, v8 @@ -134,9 +118,7 @@ ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: vsetivli zero, 2, e16, mf4, tu, mu ; RV64-NEXT: vslideup.vi v9, v8, 1 -; RV64-NEXT: .LBB4_4: # %else2 ; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 1, <2 x i1> %m, <2 x i16> %passthru) ret <2 x i16> %v @@ -147,25 +129,19 @@ define <2 x i64> @mgather_v2i64_align4(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passthru) { ; RV32-LABEL: mgather_v2i64_align4: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32-NEXT: vslideup.vi v11, v10, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmsne.vi v10, v11, 0 -; RV32-NEXT: addi a0, sp, 15 -; RV32-NEXT: vsm.v v10, (a0) -; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV32-NEXT: vmv.x.s a0, v0 ; RV32-NEXT: andi a1, a0, 1 ; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: beqz a1, .LBB5_2 -; RV32-NEXT: # %bb.1: # %cond.load +; RV32-NEXT: bnez a1, .LBB5_3 +; RV32-NEXT: # %bb.1: # %else +; RV32-NEXT: andi a0, a0, 2 +; RV32-NEXT: bnez a0, .LBB5_4 +; RV32-NEXT: .LBB5_2: # %else2 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; RV32-NEXT: .LBB5_3: # %cond.load ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) @@ -173,10 +149,9 @@ ; RV32-NEXT: vslide1up.vx v12, v11, a1 ; RV32-NEXT: vsetivli zero, 1, e64, m1, tu, mu ; RV32-NEXT: vslideup.vi v9, v12, 0 -; RV32-NEXT: .LBB5_2: # %else ; RV32-NEXT: andi a0, a0, 2 -; RV32-NEXT: beqz a0, .LBB5_4 -; RV32-NEXT: # %bb.3: # %cond.load1 +; RV32-NEXT: beqz a0, .LBB5_2 +; RV32-NEXT: .LBB5_4: # %cond.load1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: 
vmv.x.s a0, v8 @@ -187,31 +162,23 @@ ; RV32-NEXT: vslide1up.vx v10, v8, a0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, mu ; RV32-NEXT: vslideup.vi v9, v10, 1 -; RV32-NEXT: .LBB5_4: # %else2 ; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: mgather_v2i64_align4: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmv.v.i v11, 0 -; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64-NEXT: vslideup.vi v11, v10, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmsne.vi v10, v11, 0 -; RV64-NEXT: addi a0, sp, 15 -; RV64-NEXT: vsm.v v10, (a0) -; RV64-NEXT: lbu a0, 15(sp) +; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV64-NEXT: vmv.x.s a0, v0 ; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: beqz a1, .LBB5_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: bnez a1, .LBB5_3 +; RV64-NEXT: # %bb.1: # %else +; RV64-NEXT: andi a0, a0, 2 +; RV64-NEXT: bnez a0, .LBB5_4 +; RV64-NEXT: .LBB5_2: # %else2 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret +; RV64-NEXT: .LBB5_3: # %cond.load +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: lwu a2, 4(a1) ; RV64-NEXT: lwu a1, 0(a1) @@ -219,10 +186,9 @@ ; RV64-NEXT: or a1, a2, a1 ; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu ; RV64-NEXT: vmv.s.x v9, a1 -; RV64-NEXT: .LBB5_2: # %else ; RV64-NEXT: andi a0, a0, 2 -; RV64-NEXT: beqz a0, .LBB5_4 -; RV64-NEXT: # %bb.3: # %cond.load1 +; RV64-NEXT: beqz a0, .LBB5_2 +; RV64-NEXT: .LBB5_4: # %cond.load1 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vslidedown.vi v8, v8, 1 ; RV64-NEXT: vmv.x.s a0, v8 @@ -233,9 +199,7 @@ ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, mu ; RV64-NEXT: vslideup.vi v9, v8, 1 -; RV64-NEXT: .LBB5_4: # %else2 ; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 4, <2 x i1> %m, <2 x i64> %passthru) ret <2 x i64> %v @@ -246,20 +210,8 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m) { ; RV32-LABEL: mscatter_v4i16_align1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV32-NEXT: vslideup.vi v11, v10, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmsne.vi v10, v11, 0 -; RV32-NEXT: addi a0, sp, 15 -; RV32-NEXT: vsm.v v10, (a0) -; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV32-NEXT: vmv.x.s a0, v0 ; RV32-NEXT: andi a1, a0, 1 ; RV32-NEXT: bnez a1, .LBB6_5 ; RV32-NEXT: # %bb.1: # %else @@ -272,7 +224,6 @@ ; RV32-NEXT: andi a0, a0, 8 ; RV32-NEXT: bnez a0, .LBB6_8 ; RV32-NEXT: .LBB6_4: # %else6 -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; RV32-NEXT: .LBB6_5: # %cond.store ; RV32-NEXT: vsetivli zero, 0, e16, mf2, ta, mu @@ -318,25 +269,12 @@ ; RV32-NEXT: sb a0, 0(a1) ; RV32-NEXT: srli a0, a0, 8 ; RV32-NEXT: sb a0, 1(a1) -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: mscatter_v4i16_align1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; 
RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64-NEXT: vmv.v.i v9, 0 -; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmv.v.i v12, 0 -; RV64-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; RV64-NEXT: vslideup.vi v12, v9, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmsne.vi v9, v12, 0 -; RV64-NEXT: addi a0, sp, 15 -; RV64-NEXT: vsm.v v9, (a0) -; RV64-NEXT: lbu a0, 15(sp) +; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV64-NEXT: vmv.x.s a0, v0 ; RV64-NEXT: andi a1, a0, 1 ; RV64-NEXT: bnez a1, .LBB6_5 ; RV64-NEXT: # %bb.1: # %else @@ -349,7 +287,6 @@ ; RV64-NEXT: andi a0, a0, 8 ; RV64-NEXT: bnez a0, .LBB6_8 ; RV64-NEXT: .LBB6_4: # %else6 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; RV64-NEXT: .LBB6_5: # %cond.store ; RV64-NEXT: vsetivli zero, 0, e16, mf2, ta, mu @@ -395,7 +332,6 @@ ; RV64-NEXT: sb a0, 0(a1) ; RV64-NEXT: srli a0, a0, 8 ; RV64-NEXT: sb a0, 1(a1) -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 1, <4 x i1> %m) ret void @@ -406,30 +342,17 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) { ; RV32-LABEL: mscatter_v2i32_align2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32-NEXT: vslideup.vi v11, v10, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmsne.vi v10, v11, 0 -; RV32-NEXT: addi a0, sp, 15 -; RV32-NEXT: vsm.v v10, (a0) -; RV32-NEXT: lbu a0, 15(sp) +; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV32-NEXT: vmv.x.s a0, v0 ; RV32-NEXT: andi a1, a0, 1 ; RV32-NEXT: bnez a1, .LBB7_3 ; RV32-NEXT: # %bb.1: # %else ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: bnez a0, .LBB7_4 ; RV32-NEXT: .LBB7_2: # %else2 -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; RV32-NEXT: .LBB7_3: # %cond.store -; RV32-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a2, v9 ; RV32-NEXT: sh a1, 0(a2) @@ -446,35 +369,21 @@ ; RV32-NEXT: sh a0, 0(a1) ; RV32-NEXT: srli a0, a0, 16 ; RV32-NEXT: sh a0, 2(a1) -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: mscatter_v2i32_align2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmv.v.i v11, 0 -; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64-NEXT: vslideup.vi v11, v10, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmsne.vi v10, v11, 0 -; RV64-NEXT: addi a0, sp, 15 -; RV64-NEXT: vsm.v v10, (a0) -; RV64-NEXT: lbu a0, 15(sp) +; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, mu +; RV64-NEXT: vmv.x.s a0, v0 ; RV64-NEXT: andi a1, a0, 1 ; RV64-NEXT: bnez a1, .LBB7_3 ; RV64-NEXT: # %bb.1: # %else ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: bnez a0, .LBB7_4 ; RV64-NEXT: .LBB7_2: # %else2 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; RV64-NEXT: .LBB7_3: # %cond.store -; RV64-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: vsetvli 
zero, zero, e64, m1, ta, mu ; RV64-NEXT: vmv.x.s a2, v9 @@ -493,7 +402,6 @@ ; RV64-NEXT: sh a0, 0(a1) ; RV64-NEXT: srli a0, a0, 16 ; RV64-NEXT: sh a0, 2(a1) -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -504,21 +412,10 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %res_ptr) nounwind { ; RV32-LABEL: masked_load_v2i32_align1: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vmseq.vi v0, v8, 0 -; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmseq.vi v8, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; RV32-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV32-NEXT: vslideup.vi v9, v8, 0 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: vmsne.vi v8, v9, 0 -; RV32-NEXT: addi a2, sp, 15 -; RV32-NEXT: vsm.v v8, (a2) -; RV32-NEXT: lbu a2, 15(sp) +; RV32-NEXT: vmv.x.s a2, v8 ; RV32-NEXT: andi a3, a2, 1 ; RV32-NEXT: beqz a3, .LBB8_2 ; RV32-NEXT: # %bb.1: # %cond.load @@ -559,26 +456,14 @@ ; RV32-NEXT: .LBB8_4: # %else2 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; RV32-NEXT: vse32.v v8, (a1) -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: masked_load_v2i32_align1: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmseq.vi v8, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; RV64-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmv.v.i v9, 0 -; RV64-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; RV64-NEXT: vslideup.vi v9, v8, 0 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vmsne.vi v8, v9, 0 -; RV64-NEXT: addi a2, sp, 15 -; RV64-NEXT: vsm.v v8, (a2) -; RV64-NEXT: lbu a2, 15(sp) +; RV64-NEXT: vmv.x.s a2, v8 ; RV64-NEXT: andi a3, a2, 1 ; RV64-NEXT: beqz a3, .LBB8_2 ; RV64-NEXT: # %bb.1: # %cond.load @@ -619,7 +504,6 @@ ; RV64-NEXT: .LBB8_4: # %else2 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; RV64-NEXT: vse32.v v8, (a1) -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %mask = icmp eq <2 x i32> %m, zeroinitializer %load = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %a, i32 1, <2 x i1> %mask, <2 x i32> undef) @@ -632,31 +516,19 @@ define void @masked_store_v2i32_align2(<2 x i32> %val, <2 x i32>* %a, <2 x i32> %m) nounwind { ; CHECK-LABEL: masked_store_v2i32_align2: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vmseq.vi v0, v9, 0 -; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vmseq.vi v9, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, mu -; CHECK-NEXT: vslideup.vi v10, v9, 0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmsne.vi v9, v10, 0 -; CHECK-NEXT: addi a1, sp, 15 -; CHECK-NEXT: vsm.v v9, (a1) -; CHECK-NEXT: lbu a1, 15(sp) +; CHECK-NEXT: vmv.x.s a1, v9 ; CHECK-NEXT: andi a2, a1, 1 ; CHECK-NEXT: bnez a2, .LBB9_3 ; CHECK-NEXT: # %bb.1: # %else ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: bnez a1, .LBB9_4 ; CHECK-NEXT: .LBB9_2: # %else2 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB9_3: # 
%cond.store -; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmv.x.s a2, v8 ; CHECK-NEXT: sh a2, 0(a0) ; CHECK-NEXT: srli a2, a2, 16 @@ -670,7 +542,6 @@ ; CHECK-NEXT: sh a1, 4(a0) ; CHECK-NEXT: srli a1, a1, 16 ; CHECK-NEXT: sh a1, 6(a0) -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %mask = icmp eq <2 x i32> %m, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %a, i32 2, <2 x i1> %mask)