diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -477,7 +477,7 @@
   SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
                                                SelectionDAG &DAG) const;
-  SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerMGATHERMSCATTER(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -475,6 +475,7 @@
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 
       setOperationAction(ISD::MGATHER, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
 
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
@@ -517,6 +518,7 @@
       setOperationAction(ISD::FCOPYSIGN, VT, Legal);
 
       setOperationAction(ISD::MGATHER, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
 
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
@@ -691,6 +693,7 @@
   if (Subtarget.hasStdExtV()) {
     setTargetDAGCombine(ISD::FCOPYSIGN);
     setTargetDAGCombine(ISD::MGATHER);
+    setTargetDAGCombine(ISD::MSCATTER);
   }
 }
@@ -1711,7 +1714,8 @@
   case ISD::FCOPYSIGN:
     return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
   case ISD::MGATHER:
-    return lowerMGATHER(Op, DAG);
+  case ISD::MSCATTER:
+    return lowerMGATHERMSCATTER(Op, DAG);
   }
 }
@@ -3407,39 +3411,50 @@
 // "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
 // truncated to XLEN and are treated as byte offsets. Any signed or scaled
 // indexing is extended to the XLEN value type and scaled accordingly.
-SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
-  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+SDValue RISCVTargetLowering::lowerMGATHERMSCATTER(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  auto *N = cast<MaskedGatherScatterSDNode>(Op.getNode());
   SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
   SDValue Index = N->getIndex();
   SDValue Mask = N->getMask();
-  SDValue PassThru = N->getPassThru();
 
   MVT XLenVT = Subtarget.getXLenVT();
   assert(N->getBasePtr().getSimpleValueType() == XLenVT &&
          "Unexpected pointer type");
-  // Targets have to explicitly opt-in for extending vector loads.
-  assert(N->getExtensionType() == ISD::NON_EXTLOAD &&
+  // Targets have to explicitly opt-in for extending vector loads and
+  // truncating vector stores.
+  const auto *MGN = dyn_cast<MaskedGatherSDNode>(N);
+  const auto *MSN = dyn_cast<MaskedScatterSDNode>(N);
+  assert((!MGN || MGN->getExtensionType() == ISD::NON_EXTLOAD) &&
          "Unexpected extending MGATHER");
+  assert((!MSN || !MSN->isTruncatingStore()) &&
+         "Unexpected extending MSCATTER");
 
-  SDValue VL = getDefaultVLOps(VT, VT, DL, DAG, Subtarget).second;
   // If the mask is known to be all ones, optimize to an unmasked intrinsic;
   // the selection of the masked intrinsics doesn't do this for us.
-  if (ISD::isConstantSplatVectorAllOnes(Mask.getNode())) {
-    SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vloxei, DL, XLenVT);
-    SDValue Ops[] = {N->getChain(), IntID, N->getBasePtr(), Index, VL};
-    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL,
-                                   DAG.getVTList(VT, MVT::Other), Ops,
-                                   N->getMemoryVT(), N->getMemOperand());
-  }
+  unsigned IntID = 0;
+  MVT IndexVT = Index.getSimpleValueType();
+  SDValue VL = getDefaultVLOps(IndexVT, IndexVT, DL, DAG, Subtarget).second;
+  bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
 
-  SDValue IntID =
-      DAG.getTargetConstant(Intrinsic::riscv_vloxei_mask, DL, XLenVT);
-  SDValue Ops[] = {N->getChain(), IntID, PassThru, N->getBasePtr(),
-                   Index, Mask, VL};
-  return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL,
-                                 DAG.getVTList(VT, MVT::Other), Ops,
-                                 N->getMemoryVT(), N->getMemOperand());
+  if (IsUnmasked)
+    IntID = MGN ? Intrinsic::riscv_vloxei : Intrinsic::riscv_vsoxei;
+  else
+    IntID = MGN ? Intrinsic::riscv_vloxei_mask : Intrinsic::riscv_vsoxei_mask;
+  SmallVector<SDValue, 8> Ops{N->getChain(),
+                              DAG.getTargetConstant(IntID, DL, XLenVT)};
+  if (MSN)
+    Ops.push_back(MSN->getValue());
+  else if (!IsUnmasked)
+    Ops.push_back(MGN->getPassThru());
+  Ops.push_back(N->getBasePtr());
+  Ops.push_back(Index);
+  if (!IsUnmasked)
+    Ops.push_back(Mask);
+  Ops.push_back(VL);
+  return DAG.getMemIntrinsicNode(
+      MGN ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, N->getVTList(),
+      Ops, N->getMemoryVT(), N->getMemOperand());
 }
 
 // Returns the opcode of the target-specific SDNode that implements the 32-bit
@@ -4459,18 +4474,19 @@
     return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N->getOperand(0),
                        DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound));
   }
-  case ISD::MGATHER: {
+  case ISD::MGATHER:
+  case ISD::MSCATTER: {
     if (!DCI.isBeforeLegalize())
       break;
-    MaskedGatherSDNode *MGN = cast<MaskedGatherSDNode>(N);
-    SDValue Index = MGN->getIndex();
+    MaskedGatherScatterSDNode *MGSN = cast<MaskedGatherScatterSDNode>(N);
+    SDValue Index = MGSN->getIndex();
     EVT IndexVT = Index.getValueType();
     MVT XLenVT = Subtarget.getXLenVT();
     // RISCV indexed loads only support the "unsigned unscaled" addressing
     // mode, so anything else must be manually legalized.
-    bool NeedsIdxLegalization =
-        MGN->isIndexScaled() ||
-        (MGN->isIndexSigned() && IndexVT.getVectorElementType().bitsLT(XLenVT));
+    bool NeedsIdxLegalization = MGSN->isIndexScaled() ||
+                                (MGSN->isIndexSigned() &&
+                                 IndexVT.getVectorElementType().bitsLT(XLenVT));
     if (!NeedsIdxLegalization)
       break;
@@ -4481,13 +4497,13 @@
     // LLVM's legalization take care of the splitting.
     if (IndexVT.getVectorElementType().bitsLT(XLenVT)) {
       IndexVT = IndexVT.changeVectorElementType(XLenVT);
-      Index = DAG.getNode(MGN->isIndexSigned() ? ISD::SIGN_EXTEND
-                                               : ISD::ZERO_EXTEND,
+      Index = DAG.getNode(MGSN->isIndexSigned() ? ISD::SIGN_EXTEND
+                                                : ISD::ZERO_EXTEND,
                           DL, IndexVT, Index);
     }
 
     unsigned Scale = N->getConstantOperandVal(5);
-    if (MGN->isIndexScaled() && Scale != 1) {
+    if (MGSN->isIndexScaled() && Scale != 1) {
       // Manually scale the indices by the element size.
       // TODO: Sanitize the scale operand here?
       assert(isPowerOf2_32(Scale) && "Expecting power-of-two types");
@@ -4496,11 +4512,19 @@
     }
 
     ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED;
-    return DAG.getMaskedGather(
-        N->getVTList(), MGN->getMemoryVT(), DL,
-        {MGN->getChain(), MGN->getPassThru(), MGN->getMask(), MGN->getBasePtr(),
-         Index, MGN->getScale()},
-        MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType());
+    if (const auto *MGN = dyn_cast<MaskedGatherSDNode>(N)) {
+      return DAG.getMaskedGather(
+          N->getVTList(), MGSN->getMemoryVT(), DL,
+          {MGSN->getChain(), MGN->getPassThru(), MGSN->getMask(),
+           MGSN->getBasePtr(), Index, MGN->getScale()},
+          MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType());
+    }
+    const auto *MSN = cast<MaskedScatterSDNode>(N);
+    return DAG.getMaskedScatter(
+        N->getVTList(), MGSN->getMemoryVT(), DL,
+        {MGSN->getChain(), MSN->getValue(), MGSN->getMask(), MGSN->getBasePtr(),
+         Index, MGSN->getScale()},
+        MGSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore());
   }
   }
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -0,0 +1,1854 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
+
+declare void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8>, <vscale x 1 x i8*>, i32, <vscale x 1 x i1>)
+
+define void @mscatter_nxv1i8(<vscale x 1 x i8> %val, <vscale x 1 x i8*> %ptrs, <vscale x 1 x i1> %m) {
+; RV32-LABEL: mscatter_nxv1i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv1i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv1i8.nxv1p0i8(<vscale x 1 x i8> %val, <vscale x 1 x i8*> %ptrs, i32 1, <vscale x 1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
+
+define void @mscatter_nxv2i8(<vscale x 2 x i8> %val, <vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> %val, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_nxv2i16_truncstore_nxv2i8(<vscale x 2 x i16> %val, <vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i16_truncstore_nxv2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i16_truncstore_nxv2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsoxei64.v v25, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <vscale x 2 x i16> %val to <vscale x 2 x i8>
+  call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> %tval, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_nxv2i32_truncstore_nxv2i8(<vscale x 2 x i32> %val, <vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i32_truncstore_nxv2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsoxei32.v v26, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i32_truncstore_nxv2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsoxei64.v v26, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <vscale x 2 x i32> %val to <vscale x 2 x i8>
+  call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> %tval, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_nxv2i64_truncstore_nxv2i8(<vscale x 2 x i64> %val, <vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v26, 0
+; RV32-NEXT:    vsoxei32.v v25, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v26, 0
+; RV64-NEXT:    vsoxei64.v v25, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
+  call void @llvm.masked.scatter.nxv2i8.nxv2p0i8(<vscale x 2 x i8> %tval, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
+
+define void @mscatter_nxv4i8(<vscale x 4 x i8> %val, <vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %m) {
+; RV32-LABEL: mscatter_nxv4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> %val, <vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_nxv4i8(<vscale x 4 x i8> %val, <vscale x 4 x i8*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_nxv4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_nxv4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12
+; RV64-NEXT:    ret
+  %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.nxv4i8.nxv4p0i8(<vscale x 4 x i8> %val, <vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mtrue)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8>, <vscale x 8 x i8*>, i32, <vscale x 8 x i1>)
+
+define void @mscatter_nxv8i8(<vscale x 8 x i8> %val, <vscale x 8 x i8*> %ptrs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_nxv8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v12, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> %val, <vscale x 8 x i8*> %ptrs, i32 1, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8i8(<vscale x 8 x i8> %val, i8* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v9
+; RV32-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v9
+; RV64-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <vscale x 8 x i8> %idxs
+  call void @llvm.masked.scatter.nxv8i8.nxv8p0i8(<vscale x 8 x i8> %val, <vscale x 8 x i8*> %ptrs, i32 1, <vscale x 8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16>, <vscale x 1 x i16*>, i32, <vscale x 1 x i1>)
+
+define void @mscatter_nxv1i16(<vscale x 1 x i16> %val, <vscale x 1 x i16*> %ptrs, <vscale x 1 x i1> %m) {
+; RV32-LABEL: mscatter_nxv1i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv1i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv1i16.nxv1p0i16(<vscale x 1 x i16> %val, <vscale x 1 x i16*> %ptrs, i32 2, <vscale x 1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+
+define void @mscatter_nxv2i16(<vscale x 2 x i16> %val, <vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> %val, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_nxv2i32_truncstore_nxv2i16(<vscale x 2 x i32> %val, <vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i32_truncstore_nxv2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i32_truncstore_nxv2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsoxei64.v v25, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <vscale x 2 x i32> %val to <vscale x 2 x i16>
+  call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> %tval, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_nxv2i64_truncstore_nxv2i16(<vscale x 2 x i64> %val, <vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsoxei32.v v26, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsoxei64.v v26, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
+  call void @llvm.masked.scatter.nxv2i16.nxv2p0i16(<vscale x 2 x i16> %tval, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
+
+define void @mscatter_nxv4i16(<vscale x 4 x i16> %val, <vscale x 4 x i16*> %ptrs, <vscale x 4 x i1> %m) {
+; RV32-LABEL: mscatter_nxv4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_nxv4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %val, <vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_nxv4i16(<vscale x 4 x i16> %val, <vscale x 4 x i16*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_nxv4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_nxv4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12
+; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %val, %ptrs, i32 2, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i16.nxv8p0i16(, , i32, ) + +define void @mscatter_nxv8i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), 
v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1i32.nxv1p0i32(, , i32, ) + +define void @mscatter_nxv1i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i32.nxv1p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i32.nxv2p0i32(, , i32, ) + +define void @mscatter_nxv2i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i32.nxv2p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_nxv2i64_truncstore_nxv2i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsoxei32.v v25, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsoxei64.v v25, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i32.nxv2p0i32( %tval, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i32.nxv4p0i32(, , i32, ) + +define void @mscatter_nxv4i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_truemask_nxv4i32( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %val, %ptrs, i32 4, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i32.nxv8p0i32(, , i32, ) + +define void @mscatter_nxv8i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; 
RV64-LABEL: mscatter_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, 
e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1i64.nxv1p0i64(, , i32, ) + +define void @mscatter_nxv1i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i64.nxv2p0i64(, , i32, ) + +define void @mscatter_nxv2i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i64.nxv2p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i64.nxv4p0i64(, , i32, ) + +define void @mscatter_nxv4i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, 
e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_truemask_nxv4i64( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( %val, %ptrs, i32 8, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i64.nxv8p0i64(, , i32, ) + +define void @mscatter_nxv8i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define 
void @mscatter_baseidx_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v16, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; 
RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1f16.nxv1p0f16(, , i32, ) + +define void @mscatter_nxv1f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1f16.nxv1p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16.nxv2p0f16(, , i32, ) + +define void @mscatter_nxv2f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2f16.nxv2p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4f16.nxv4p0f16(, , i32, ) + +define void @mscatter_nxv4f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f16.nxv4p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_truemask_nxv4f16( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: 
ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4f16.nxv4p0f16( %val, %ptrs, i32 2, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8f16.nxv8p0f16(, , i32, ) + +define void @mscatter_nxv8f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), 
v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1f32.nxv1p0f32(, , i32, ) + +define void @mscatter_nxv1f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1f32.nxv1p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2f32.nxv2p0f32(, , i32, ) + +define void @mscatter_nxv2f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(, , i32, ) + +define void @mscatter_nxv4f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_truemask_nxv4f32( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %val, %ptrs, i32 4, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8f32.nxv8p0f32(, , i32, ) + +define void @mscatter_nxv8f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f32: +; 
RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret 
void
+}
+
+define void @mscatter_baseidx_zext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, float* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf2 v28, v12
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v16, v12
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %eidxs
+  call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> %val, <vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8f32(<vscale x 8 x float> %val, float* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v12, 2
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v16, v12
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %idxs
+  call void @llvm.masked.scatter.nxv8f32.nxv8p0f32(<vscale x 8 x float> %val, <vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double>, <vscale x 1 x double*>, i32, <vscale x 1 x i1>)
+
+define void @mscatter_nxv1f64(<vscale x 1 x double> %val, <vscale x 1 x double*> %ptrs, <vscale x 1 x i1> %m) {
+; RV32-LABEL: mscatter_nxv1f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_nxv1f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT: ret
+  call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> %val, <vscale x 1 x double*> %ptrs, i32 8, <vscale x 1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+
+define void @mscatter_nxv2f64(<vscale x 2 x double> %val, <vscale x 2 x double*> %ptrs, <vscale x 2 x i1> %m) {
+; RV32-LABEL: mscatter_nxv2f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_nxv2f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT: ret
+  call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %val, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double>, <vscale x 4 x double*>, i32, <vscale x 4 x i1>)
+
+define void @mscatter_nxv4f64(<vscale x 4 x double> %val, <vscale x 4 x double*> %ptrs, <vscale x 4 x i1> %m) {
+; RV32-LABEL: mscatter_nxv4f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_nxv4f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT: ret
+  call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> %val, <vscale x 4 x double*> %ptrs, i32 8, <vscale x 4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_nxv4f64(<vscale x 4 x double> %val, <vscale x 4 x double*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_nxv4f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_truemask_nxv4f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v12
+; RV64-NEXT: ret
+  %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> %val, <vscale x 4 x double*> %ptrs, i32 8, <vscale x 4 x i1> %mtrue)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double>, <vscale x 8 x double*>, i32, <vscale x 8 x i1>)
+
+define void @mscatter_nxv8f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT: ret
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8i8_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v16
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i8> %idxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf8 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf8 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8i16_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v16
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i16> %idxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = sext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf4 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8i32_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v16, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i32> %idxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_nxv8i32_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf2 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_nxv8i32_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf2 v24, v16
+; RV32-NEXT: vsll.vi v16, v24, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf2 v24, v16
+; RV64-NEXT: vsll.vi v16, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv8f64(<vscale x 8 x double> %val, double* %base, <vscale x 8 x i64> %idxs, <vscale x 8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsll.vi v16, v16, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %idxs
+  call void @llvm.masked.scatter.nxv8f64.nxv8p0f64(<vscale x 8 x double> %val, <vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double>, <vscale x 16 x double*>, i32, <vscale x 16 x i1>)
+
+declare <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double>, <vscale x 8 x double>, i64)
+declare <vscale x 16 x double*> @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64(<vscale x 16 x double*>, <vscale x 8 x double*>, i64)
+
+define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, <vscale x 8 x double*> %ptrs0, <vscale x 8 x double*> %ptrs1, <vscale x 16 x i1> %m) {
+; RV32-LABEL: mscatter_nxv16f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vl4re32.v v28, (a0)
+; RV32-NEXT: vl4re32.v v24, (a1)
+; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (zero), v28, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: vsetvli a1, zero, e8,mf4,ta,mu
+; RV32-NEXT: vslidedown.vx v0, v0, a0
+; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v16, (zero), v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_nxv16f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: vl8re64.v v24, (a0)
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vl8re64.v v16, (a1)
+; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: srli a0, a0, 3
+; RV64-NEXT: vsetvli a1, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v0, a0
+; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %p0 = call <vscale x 16 x double*> @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64(<vscale x 16 x double*> undef, <vscale x 8 x double*> %ptrs0, i64 0)
+  %p1 = call <vscale x 16 x double*> @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64(<vscale x 16 x double*> %p0, <vscale x 8 x double*> %ptrs1, i64 8)
+  %v0 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+  %v1 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+  call void @llvm.masked.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double> %v1, <vscale x 16 x double*> %p1, i32 8, <vscale x 16 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv16i8_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, double* %base, <vscale x 16 x i8> %idxs, <vscale x 16 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv16i8_nxv16f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vl2r.v v2, (a1)
+; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v24, v2
+; RV32-NEXT: vsll.vi v24, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv16i8_nxv16f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vl2r.v v2, (a1)
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v2
+; RV64-NEXT: vsll.vi v24, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v8, v3
+; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 16 x i8> %idxs
+  %v0 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+  %v1 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+  call void @llvm.masked.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double> %v1, <vscale x 16 x double*> %ptrs, i32 8, <vscale x 16 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_nxv16i16_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, double* %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vl4re16.v v4, (a1)
+; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu
+; RV32-NEXT: vsext.vf2 v24, v4
+; RV32-NEXT: vsll.vi v24, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vl4re16.v v4, (a1)
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v4
+; RV64-NEXT: vsll.vi v24, v24, 3
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v8, v6
+; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: ret
+  %ptrs = getelementptr inbounds double, double* %base, <vscale x 16 x i16> %idxs
+  %v0 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+  %v1 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+  call void @llvm.masked.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double> %v1, <vscale x 16 x double*> %ptrs, i32 8, <vscale x 16 x i1> %m)
+  ret void
+}