Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -854,6 +854,8 @@ void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, + SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); @@ -883,6 +885,7 @@ SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); + SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo); SDValue SplitVecOp_Gather(MemSDNode *MGT, unsigned OpNo); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -948,6 +948,9 @@ case ISD::VP_LOAD: SplitVecRes_VP_LOAD(cast(N), Lo, Hi); break; + case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: + SplitVecRes_VP_STRIDED_LOAD(cast(N), Lo, Hi); + break; case ISD::MLOAD: SplitVecRes_MLOAD(cast(N), Lo, Hi); break; @@ -1871,6 +1874,79 @@ ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, + SDValue &Lo, SDValue &Hi) { + assert(SLD->isUnindexed() && + "Indexed VP strided load during type legalization!"); + assert(SLD->getOffset().isUndef() && + "Unexpected indexed variable-length load offset"); + + SDLoc 
DL(SLD); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(SLD->getValueType(0)); + + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = + DAG.GetDependentSplitDestVTs(SLD->getMemoryVT(), LoVT, &HiIsEmpty); + + SDValue Mask = SLD->getMask(); + SDValue LoMask, HiMask; + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), LoMask, HiMask); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, LoMask, HiMask); + else + std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL); + } + + SDValue LoEVL, HiEVL; + std::tie(LoEVL, HiEVL) = + DAG.SplitEVL(SLD->getVectorLength(), SLD->getValueType(0), DL); + + // Generate the low vp_strided_load + Lo = DAG.getStridedLoadVP( + SLD->getAddressingMode(), SLD->getExtensionType(), LoVT, DL, + SLD->getChain(), SLD->getBasePtr(), SLD->getOffset(), SLD->getStride(), + LoMask, LoEVL, LoMemVT, SLD->getMemOperand(), SLD->isExpandingLoad()); + + if (HiIsEmpty) { + // The high vp_strided_load has zero storage size. We therefore simply set + // it to the low vp_strided_load and rely on subsequent removal from the + // chain. + Hi = Lo; + } else { + // Generate the high vp_strided_load. 
+ // To calculate the high base address, we need to sum to the low base + // address stride number of bytes for each element already loaded by low, + // that is: Ptr = Ptr + (LoEVL * Stride) + EVT PtrVT = SLD->getBasePtr().getValueType(); + SDValue Increment = + DAG.getNode(ISD::MUL, DL, PtrVT, LoEVL, + DAG.getSExtOrTrunc(SLD->getStride(), DL, PtrVT)); + SDValue Ptr = + DAG.getNode(ISD::ADD, DL, PtrVT, SLD->getBasePtr(), Increment); + + Hi = DAG.getStridedLoadVP( + SLD->getAddressingMode(), SLD->getExtensionType(), HiVT, DL, + SLD->getChain(), Ptr, SLD->getOffset(), SLD->getStride(), HiMask, HiEVL, + MachinePointerInfo(SLD->getPointerInfo().getAddrSpace()), HiMemVT, + SLD->getOriginalAlign(), MachineMemOperand::MOLoad, SLD->getAAInfo(), + SLD->getRanges(), SLD->isExpandingLoad()); + } + + // Build a factor node to remember that this load is independent of the + // other one. + SDValue Ch = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(SLD, 1), Ch); +} + void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { assert(MLD->isUnindexed() && "Indexed masked load during type legalization!"); @@ -2354,6 +2430,9 @@ case ISD::VP_STORE: Res = SplitVecOp_VP_STORE(cast(N), OpNo); break; + case ISD::EXPERIMENTAL_VP_STRIDED_STORE: + Res = SplitVecOp_VP_STRIDED_STORE(cast(N), OpNo); + break; case ISD::MSTORE: Res = SplitVecOp_MSTORE(cast(N), OpNo); break; @@ -2825,6 +2904,80 @@ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } +SDValue DAGTypeLegalizer::SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, + unsigned OpNo) { + assert(N->isUnindexed() && "Indexed vp_strided_store of a vector?"); + assert(N->getOffset().isUndef() && "Unexpected VP strided store offset"); + + SDLoc DL(N); + + SDValue Data = N->getValue(); + SDValue LoData, HiData; + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Data, LoData, HiData); + else + std::tie(LoData, HiData) = DAG.SplitVector(Data, DL); + + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = DAG.GetDependentSplitDestVTs( + N->getMemoryVT(), LoData.getValueType(), &HiIsEmpty); + + SDValue Mask = N->getMask(); + SDValue LoMask, HiMask; + if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) + SplitVecRes_SETCC(Mask.getNode(), LoMask, HiMask); + else if (getTypeAction(Mask.getValueType()) == + TargetLowering::TypeSplitVector) + GetSplitVector(Mask, LoMask, HiMask); + else + std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL); + + SDValue LoEVL, HiEVL; + std::tie(LoEVL, HiEVL) = + DAG.SplitEVL(N->getVectorLength(), Data.getValueType(), DL); + + // Generate the low vp_strided_store + SDValue Lo = DAG.getStridedStoreVP( + N->getChain(), DL, LoData, N->getBasePtr(), N->getOffset(), + N->getStride(), LoMask, LoEVL, LoMemVT, N->getMemOperand(), + N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); + + // If the high 
vp_strided_store has zero storage size, only the low + // vp_strided_store is needed. + if (HiIsEmpty) + return Lo; + + // Generate the high vp_strided_store. + // To calculate the high base address, we need to sum to the low base + // address stride number of bytes for each element already stored by low, + // that is: Ptr = Ptr + (LoEVL * Stride) + EVT PtrVT = N->getBasePtr().getValueType(); + SDValue Increment = + DAG.getNode(ISD::MUL, DL, PtrVT, LoEVL, + DAG.getSExtOrTrunc(N->getStride(), DL, PtrVT)); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, N->getBasePtr(), Increment); + + Align Alignment = N->getOriginalAlign(); + if (LoMemVT.isScalableVector()) + Alignment = commonAlignment(Alignment, + LoMemVT.getSizeInBits().getKnownMinSize() / 8); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(N->getPointerInfo().getAddrSpace()), + MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, + N->getAAInfo(), N->getRanges()); + + SDValue Hi = DAG.getStridedStoreVP( + N->getChain(), DL, HiData, Ptr, N->getOffset(), N->getStride(), HiMask, + HiEVL, HiMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); + + // Build a factor node to remember that this store is independent of the + // other one. 
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { assert(N->isUnindexed() && "Indexed masked store of vector?"); Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-RV64 @@ -600,3 +600,227 @@ } declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0f64.i32(double*, i32, <3 x i1>, i32) + +; Splitting +define <32 x double> @strided_vpload_v32f64(double* %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind { +; CHECK-RV32-LABEL: strided_vpload_v32f64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi a4, a2, -16 +; CHECK-RV32-NEXT: vmv1r.v v8, v0 +; CHECK-RV32-NEXT: li a3, 0 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB33_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: .LBB33_2: +; CHECK-RV32-NEXT: li a4, 16 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB33_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: li a2, 16 +; CHECK-RV32-NEXT: .LBB33_4: +; CHECK-RV32-NEXT: mul a4, a2, a1 +; CHECK-RV32-NEXT: add a4, a0, a4 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-RV32-NEXT: vsetvli 
zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1, v0.t +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV32-NEXT: vmv1r.v v0, v8 +; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_vpload_v32f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi a4, a2, -16 +; CHECK-RV64-NEXT: vmv1r.v v8, v0 +; CHECK-RV64-NEXT: li a3, 0 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB33_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a3, a4 +; CHECK-RV64-NEXT: .LBB33_2: +; CHECK-RV64-NEXT: li a4, 16 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB33_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: li a2, 16 +; CHECK-RV64-NEXT: .LBB33_4: +; CHECK-RV64-NEXT: mul a4, a2, a1 +; CHECK-RV64-NEXT: add a4, a0, a4 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1, v0.t +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV64-NEXT: vmv1r.v v0, v8 +; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV64-NEXT: ret + %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0f64.i32(double* %ptr, i32 %stride, <32 x i1> %m, i32 %evl) + ret <32 x double> %load +} + +define <32 x double> @strided_vpload_v32f64_allones_mask(double* %ptr, i32 signext %stride, i32 zeroext %evl) nounwind { +; CHECK-RV32-LABEL: strided_vpload_v32f64_allones_mask: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi a4, a2, -16 +; CHECK-RV32-NEXT: li a3, 0 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB34_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: .LBB34_2: +; CHECK-RV32-NEXT: li a4, 16 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB34_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: li a2, 16 +; CHECK-RV32-NEXT: .LBB34_4: +; CHECK-RV32-NEXT: mul a4, a2, a1 +; CHECK-RV32-NEXT: add a4, a0, a4 +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: 
vlse64.v v16, (a4), a1 +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_vpload_v32f64_allones_mask: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi a4, a2, -16 +; CHECK-RV64-NEXT: li a3, 0 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB34_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a3, a4 +; CHECK-RV64-NEXT: .LBB34_2: +; CHECK-RV64-NEXT: li a4, 16 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB34_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: li a2, 16 +; CHECK-RV64-NEXT: .LBB34_4: +; CHECK-RV64-NEXT: mul a4, a2, a1 +; CHECK-RV64-NEXT: add a4, a0, a4 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 +; CHECK-RV64-NEXT: ret + %one = insertelement <32 x i1> poison, i1 true, i32 0 + %allones = shufflevector <32 x i1> %one, <32 x i1> poison, <32 x i32> zeroinitializer + %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0f64.i32(double* %ptr, i32 %stride, <32 x i1> %allones, i32 %evl) + ret <32 x double> %load +} + +declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0f64.i32(double*, i32, <32 x i1>, i32) + +; Widening + splitting (with HiIsEmpty == true) +define <33 x double> @strided_load_v33f64(double* %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) { +; CHECK-RV32-LABEL: strided_load_v33f64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a5, 32 +; CHECK-RV32-NEXT: vmv1r.v v8, v0 +; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: bltu a4, a5, .LBB35_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: li a3, 32 +; CHECK-RV32-NEXT: .LBB35_2: +; CHECK-RV32-NEXT: addi a5, a3, -16 +; CHECK-RV32-NEXT: li a7, 0 +; CHECK-RV32-NEXT: bltu a3, a5, .LBB35_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a7, a5 +; CHECK-RV32-NEXT: .LBB35_4: +; CHECK-RV32-NEXT: li a6, 16 +; CHECK-RV32-NEXT: mv a5, a3 +; 
CHECK-RV32-NEXT: bltu a3, a6, .LBB35_6 +; CHECK-RV32-NEXT: # %bb.5: +; CHECK-RV32-NEXT: li a5, 16 +; CHECK-RV32-NEXT: .LBB35_6: +; CHECK-RV32-NEXT: mul t0, a5, a2 +; CHECK-RV32-NEXT: add t0, a1, t0 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v16, (t0), a2, v0.t +; CHECK-RV32-NEXT: addi t0, a4, -32 +; CHECK-RV32-NEXT: li a7, 0 +; CHECK-RV32-NEXT: bltu a4, t0, .LBB35_8 +; CHECK-RV32-NEXT: # %bb.7: +; CHECK-RV32-NEXT: mv a7, t0 +; CHECK-RV32-NEXT: .LBB35_8: +; CHECK-RV32-NEXT: bltu a7, a6, .LBB35_10 +; CHECK-RV32-NEXT: # %bb.9: +; CHECK-RV32-NEXT: li a7, 16 +; CHECK-RV32-NEXT: .LBB35_10: +; CHECK-RV32-NEXT: mul a3, a3, a2 +; CHECK-RV32-NEXT: add a3, a1, a3 +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4 +; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v24, (a3), a2, v0.t +; CHECK-RV32-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-RV32-NEXT: vmv1r.v v0, v8 +; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV32-NEXT: vse64.v v8, (a0) +; CHECK-RV32-NEXT: addi a1, a0, 256 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m8, ta, mu +; CHECK-RV32-NEXT: vse64.v v24, (a1) +; CHECK-RV32-NEXT: addi a0, a0, 128 +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV32-NEXT: vse64.v v16, (a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_load_v33f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a5, 32 +; CHECK-RV64-NEXT: vmv1r.v v8, v0 +; CHECK-RV64-NEXT: mv a4, a3 +; CHECK-RV64-NEXT: bltu a3, a5, .LBB35_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: li a4, 32 +; CHECK-RV64-NEXT: .LBB35_2: +; CHECK-RV64-NEXT: addi a5, a4, -16 +; CHECK-RV64-NEXT: li a7, 0 +; CHECK-RV64-NEXT: bltu a4, a5, .LBB35_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a7, a5 +; CHECK-RV64-NEXT: 
.LBB35_4: +; CHECK-RV64-NEXT: li a6, 16 +; CHECK-RV64-NEXT: mv a5, a4 +; CHECK-RV64-NEXT: bltu a4, a6, .LBB35_6 +; CHECK-RV64-NEXT: # %bb.5: +; CHECK-RV64-NEXT: li a5, 16 +; CHECK-RV64-NEXT: .LBB35_6: +; CHECK-RV64-NEXT: mul t0, a5, a2 +; CHECK-RV64-NEXT: add t0, a1, t0 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (t0), a2, v0.t +; CHECK-RV64-NEXT: addi t0, a3, -32 +; CHECK-RV64-NEXT: li a7, 0 +; CHECK-RV64-NEXT: bltu a3, t0, .LBB35_8 +; CHECK-RV64-NEXT: # %bb.7: +; CHECK-RV64-NEXT: mv a7, t0 +; CHECK-RV64-NEXT: .LBB35_8: +; CHECK-RV64-NEXT: bltu a7, a6, .LBB35_10 +; CHECK-RV64-NEXT: # %bb.9: +; CHECK-RV64-NEXT: li a7, 16 +; CHECK-RV64-NEXT: .LBB35_10: +; CHECK-RV64-NEXT: mul a3, a4, a2 +; CHECK-RV64-NEXT: add a3, a1, a3 +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4 +; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v24, (a3), a2, v0.t +; CHECK-RV64-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-RV64-NEXT: vmv1r.v v0, v8 +; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV64-NEXT: vse64.v v8, (a0) +; CHECK-RV64-NEXT: addi a1, a0, 256 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m8, ta, mu +; CHECK-RV64-NEXT: vse64.v v24, (a1) +; CHECK-RV64-NEXT: addi a0, a0, 128 +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV64-NEXT: vse64.v v16, (a0) +; CHECK-RV64-NEXT: ret + %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0f64.i64(double* %ptr, i64 %stride, <33 x i1> %mask, i32 %evl) + ret <33 x double> %v +} + +declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0f64.i64(double*, i64, <33 x i1>, i32) Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll =================================================================== --- 
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK-RV64 @@ -492,3 +492,108 @@ } declare void @llvm.experimental.vp.strided.store.v3f32.p0f32.i32(<3 x float>, float* , i32, <3 x i1>, i32) + +; Splitting +define void @strided_store_v32f64(<32 x double> %v, double* %ptr, i32 signext %stride, <32 x i1> %mask, i32 zeroext %evl) { +; CHECK-RV32-LABEL: strided_store_v32f64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a4, 16 +; CHECK-RV32-NEXT: mv a3, a2 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB27_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: li a3, 16 +; CHECK-RV32-NEXT: .LBB27_2: +; CHECK-RV32-NEXT: li a4, 0 +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: addi a5, a2, -16 +; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-RV32-NEXT: bltu a2, a5, .LBB27_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a4, a5 +; CHECK-RV32-NEXT: .LBB27_4: +; CHECK-RV32-NEXT: mul a2, a3, a1 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1, v0.t +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_store_v32f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a4, 16 +; CHECK-RV64-NEXT: mv a3, a2 +; 
CHECK-RV64-NEXT: bltu a2, a4, .LBB27_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: li a3, 16 +; CHECK-RV64-NEXT: .LBB27_2: +; CHECK-RV64-NEXT: li a4, 0 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: addi a5, a2, -16 +; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-RV64-NEXT: bltu a2, a5, .LBB27_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a4, a5 +; CHECK-RV64-NEXT: .LBB27_4: +; CHECK-RV64-NEXT: mul a2, a3, a1 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1, v0.t +; CHECK-RV64-NEXT: ret + call void @llvm.experimental.vp.strided.store.v32f64.p0f64.i32(<32 x double> %v, double* %ptr, i32 %stride, <32 x i1> %mask, i32 %evl) + ret void +} + +define void @strided_store_v32f64_allones_mask(<32 x double> %v, double *%ptr, i32 signext %stride, i32 zeroext %evl) { +; CHECK-RV32-LABEL: strided_store_v32f64_allones_mask: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a4, 16 +; CHECK-RV32-NEXT: mv a3, a2 +; CHECK-RV32-NEXT: bltu a2, a4, .LBB28_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: li a3, 16 +; CHECK-RV32-NEXT: .LBB28_2: +; CHECK-RV32-NEXT: li a4, 0 +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: addi a5, a2, -16 +; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1 +; CHECK-RV32-NEXT: bltu a2, a5, .LBB28_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a4, a5 +; CHECK-RV32-NEXT: .LBB28_4: +; CHECK-RV32-NEXT: mul a2, a3, a1 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_store_v32f64_allones_mask: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a4, 16 +; CHECK-RV64-NEXT: mv a3, a2 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB28_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: li a3, 16 +; 
CHECK-RV64-NEXT: .LBB28_2: +; CHECK-RV64-NEXT: li a4, 0 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: addi a5, a2, -16 +; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1 +; CHECK-RV64-NEXT: bltu a2, a5, .LBB28_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a4, a5 +; CHECK-RV64-NEXT: .LBB28_4: +; CHECK-RV64-NEXT: mul a2, a3, a1 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1 +; CHECK-RV64-NEXT: ret + %one = insertelement <32 x i1> poison, i1 true, i32 0 + %allones = shufflevector <32 x i1> %one, <32 x i1> poison, <32 x i32> zeroinitializer + call void @llvm.experimental.vp.strided.store.v32f64.p0f64.i32(<32 x double> %v, double* %ptr, i32 %stride, <32 x i1> %allones, i32 %evl) + ret void +} + +declare void @llvm.experimental.vp.strided.store.v32f64.p0f64.i32(<32 x double>, double*, i32, <32 x i1>, i32) Index: llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+v,+experimental-zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 declare @llvm.experimental.vp.strided.load.nxv1i8.p0i8.i8(i8*, i8, , i32) @@ -760,3 +760,227 @@ } declare @llvm.experimental.vp.strided.load.nxv3f64.p0f64.i32(double*, i32, , i32) + +; Splitting +define @strided_load_nxv16f64(double* %ptr, i64 %stride, %mask, i32 zeroext %evl) { +; CHECK-RV32-LABEL: strided_load_nxv16f64: +; 
CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vmv1r.v v8, v0 +; CHECK-RV32-NEXT: li a2, 0 +; CHECK-RV32-NEXT: csrr a4, vlenb +; CHECK-RV32-NEXT: sub a6, a3, a4 +; CHECK-RV32-NEXT: srli a5, a4, 3 +; CHECK-RV32-NEXT: bltu a3, a6, .LBB42_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a2, a6 +; CHECK-RV32-NEXT: .LBB42_2: +; CHECK-RV32-NEXT: vsetvli a6, zero, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vx v0, v8, a5 +; CHECK-RV32-NEXT: bltu a3, a4, .LBB42_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: .LBB42_4: +; CHECK-RV32-NEXT: mul a4, a3, a1 +; CHECK-RV32-NEXT: add a4, a0, a4 +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1, v0.t +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: vmv1r.v v0, v8 +; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_load_nxv16f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vmv1r.v v8, v0 +; CHECK-RV64-NEXT: li a3, 0 +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: sub a6, a2, a4 +; CHECK-RV64-NEXT: srli a5, a4, 3 +; CHECK-RV64-NEXT: bltu a2, a6, .LBB42_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a3, a6 +; CHECK-RV64-NEXT: .LBB42_2: +; CHECK-RV64-NEXT: vsetvli a6, zero, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vx v0, v8, a5 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB42_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a2, a4 +; CHECK-RV64-NEXT: .LBB42_4: +; CHECK-RV64-NEXT: mul a4, a2, a1 +; CHECK-RV64-NEXT: add a4, a0, a4 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1, v0.t +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV64-NEXT: vmv1r.v v0, v8 +; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV64-NEXT: ret + %v = call @llvm.experimental.vp.strided.load.nxv16f64.p0f64.i64(double* %ptr, i64 %stride, %mask, i32 %evl) + ret %v +} + +define @strided_load_nxv16f64_allones_mask(double* 
%ptr, i64 %stride, i32 zeroext %evl) { +; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: csrr a4, vlenb +; CHECK-RV32-NEXT: sub a5, a3, a4 +; CHECK-RV32-NEXT: li a2, 0 +; CHECK-RV32-NEXT: bltu a3, a5, .LBB43_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a2, a5 +; CHECK-RV32-NEXT: .LBB43_2: +; CHECK-RV32-NEXT: bltu a3, a4, .LBB43_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: .LBB43_4: +; CHECK-RV32-NEXT: mul a4, a3, a1 +; CHECK-RV32-NEXT: add a4, a0, a4 +; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_load_nxv16f64_allones_mask: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: sub a5, a2, a4 +; CHECK-RV64-NEXT: li a3, 0 +; CHECK-RV64-NEXT: bltu a2, a5, .LBB43_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a3, a5 +; CHECK-RV64-NEXT: .LBB43_2: +; CHECK-RV64-NEXT: bltu a2, a4, .LBB43_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a2, a4 +; CHECK-RV64-NEXT: .LBB43_4: +; CHECK-RV64-NEXT: mul a4, a2, a1 +; CHECK-RV64-NEXT: add a4, a0, a4 +; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (a4), a1 +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 +; CHECK-RV64-NEXT: ret + %one = insertelement poison, i1 true, i32 0 + %allones = shufflevector %one, poison, zeroinitializer + %v = call @llvm.experimental.vp.strided.load.nxv16f64.p0f64.i64(double* %ptr, i64 %stride, %allones, i32 %evl) + ret %v +} + +declare @llvm.experimental.vp.strided.load.nxv16f64.p0f64.i64(double*, i64, , i32) + +; Widening + splitting (with HiIsEmpty == true) +; NOTE: We can't return as that introduces a vector +; store that can't yet be legalized through widening. 
In order to test purely +; the vp.strided.load legalization, we manually split it. +define @strided_load_nxv17f64(double* %ptr, i64 %stride, %mask, i32 zeroext %evl, * %hi_ptr) { +; CHECK-RV32-LABEL: strided_load_nxv17f64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: slli a7, a2, 1 +; CHECK-RV32-NEXT: vmv1r.v v8, v0 +; CHECK-RV32-NEXT: mv a5, a3 +; CHECK-RV32-NEXT: bltu a3, a7, .LBB44_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a5, a7 +; CHECK-RV32-NEXT: .LBB44_2: +; CHECK-RV32-NEXT: sub a6, a5, a2 +; CHECK-RV32-NEXT: li t0, 0 +; CHECK-RV32-NEXT: bltu a5, a6, .LBB44_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv t0, a6 +; CHECK-RV32-NEXT: .LBB44_4: +; CHECK-RV32-NEXT: srli a6, a2, 3 +; CHECK-RV32-NEXT: vsetvli t1, zero, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vx v0, v8, a6 +; CHECK-RV32-NEXT: mv a6, a5 +; CHECK-RV32-NEXT: bltu a5, a2, .LBB44_6 +; CHECK-RV32-NEXT: # %bb.5: +; CHECK-RV32-NEXT: mv a6, a2 +; CHECK-RV32-NEXT: .LBB44_6: +; CHECK-RV32-NEXT: mul t1, a6, a1 +; CHECK-RV32-NEXT: add t1, a0, t1 +; CHECK-RV32-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v16, (t1), a1, v0.t +; CHECK-RV32-NEXT: li t0, 0 +; CHECK-RV32-NEXT: sub t1, a3, a7 +; CHECK-RV32-NEXT: srli a7, a2, 2 +; CHECK-RV32-NEXT: bltu a3, t1, .LBB44_8 +; CHECK-RV32-NEXT: # %bb.7: +; CHECK-RV32-NEXT: mv t0, t1 +; CHECK-RV32-NEXT: .LBB44_8: +; CHECK-RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-RV32-NEXT: vslidedown.vx v0, v8, a7 +; CHECK-RV32-NEXT: bltu t0, a2, .LBB44_10 +; CHECK-RV32-NEXT: # %bb.9: +; CHECK-RV32-NEXT: mv t0, a2 +; CHECK-RV32-NEXT: .LBB44_10: +; CHECK-RV32-NEXT: mul a2, a5, a1 +; CHECK-RV32-NEXT: add a2, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV32-NEXT: vlse64.v v24, (a2), a1, v0.t +; CHECK-RV32-NEXT: vsetvli zero, a6, e64, m8, ta, mu +; CHECK-RV32-NEXT: vmv1r.v v0, v8 +; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV32-NEXT: vs1r.v v24, (a4) +; 
CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_load_nxv17f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a7, a4, 1 +; CHECK-RV64-NEXT: vmv1r.v v8, v0 +; CHECK-RV64-NEXT: mv a5, a2 +; CHECK-RV64-NEXT: bltu a2, a7, .LBB44_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a5, a7 +; CHECK-RV64-NEXT: .LBB44_2: +; CHECK-RV64-NEXT: sub a6, a5, a4 +; CHECK-RV64-NEXT: li t0, 0 +; CHECK-RV64-NEXT: bltu a5, a6, .LBB44_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv t0, a6 +; CHECK-RV64-NEXT: .LBB44_4: +; CHECK-RV64-NEXT: srli a6, a4, 3 +; CHECK-RV64-NEXT: vsetvli t1, zero, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vx v0, v8, a6 +; CHECK-RV64-NEXT: mv a6, a5 +; CHECK-RV64-NEXT: bltu a5, a4, .LBB44_6 +; CHECK-RV64-NEXT: # %bb.5: +; CHECK-RV64-NEXT: mv a6, a4 +; CHECK-RV64-NEXT: .LBB44_6: +; CHECK-RV64-NEXT: mul t1, a6, a1 +; CHECK-RV64-NEXT: add t1, a0, t1 +; CHECK-RV64-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v16, (t1), a1, v0.t +; CHECK-RV64-NEXT: li t0, 0 +; CHECK-RV64-NEXT: sub t1, a2, a7 +; CHECK-RV64-NEXT: srli a7, a4, 2 +; CHECK-RV64-NEXT: bltu a2, t1, .LBB44_8 +; CHECK-RV64-NEXT: # %bb.7: +; CHECK-RV64-NEXT: mv t0, t1 +; CHECK-RV64-NEXT: .LBB44_8: +; CHECK-RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-RV64-NEXT: vslidedown.vx v0, v8, a7 +; CHECK-RV64-NEXT: bltu t0, a4, .LBB44_10 +; CHECK-RV64-NEXT: # %bb.9: +; CHECK-RV64-NEXT: mv t0, a4 +; CHECK-RV64-NEXT: .LBB44_10: +; CHECK-RV64-NEXT: mul a2, a5, a1 +; CHECK-RV64-NEXT: add a2, a0, a2 +; CHECK-RV64-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV64-NEXT: vlse64.v v24, (a2), a1, v0.t +; CHECK-RV64-NEXT: vsetvli zero, a6, e64, m8, ta, mu +; CHECK-RV64-NEXT: vmv1r.v v0, v8 +; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-RV64-NEXT: vs1r.v v24, (a3) +; CHECK-RV64-NEXT: ret + %v = call @llvm.experimental.vp.strided.load.nxv17f64.p0f64.i64(double* %ptr, i64 %stride, %mask, i32 %evl) + %lo = call 
@llvm.experimental.vector.extract.nxv16f64(<vscale x 17 x double> %v, i64 0)
+  %hi = call <vscale x 1 x double> @llvm.experimental.vector.extract.nxv1f64(<vscale x 17 x double> %v, i64 16)
+  store <vscale x 1 x double> %hi, <vscale x 1 x double>* %hi_ptr
+  ret <vscale x 16 x double> %lo
+}
+
+declare <vscale x 17 x double> @llvm.experimental.vp.strided.load.nxv17f64.p0f64.i64(double*, i64, <vscale x 17 x i1>, i32)
+declare <vscale x 1 x double> @llvm.experimental.vector.extract.nxv1f64(<vscale x 17 x double> %vec, i64 %idx)
+declare <vscale x 16 x double> @llvm.experimental.vector.extract.nxv16f64(<vscale x 17 x double> %vec, i64 %idx)
Index: llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+v,+experimental-zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+v,+experimental-zvfh \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+experimental-zvfh \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64
 
 declare void @llvm.experimental.vp.strided.store.nxv1i8.p0i8.i8(<vscale x 1 x i8>, i8*, i8, <vscale x 1 x i1>, i32)
@@ -616,3 +616,245 @@
 }
 
 declare void @llvm.experimental.vp.strided.store.nxv3f32.p0f32.i32(<vscale x 3 x float>, float*, i32, <vscale x 3 x i1>, i32)
+
+; Splitting
+define void @strided_store_nxv16f64(<vscale x 16 x double> %v, double* %ptr, i32 signext %stride, <vscale x 16 x i1> %mask, i32 zeroext %evl) {
+; CHECK-RV32-LABEL: strided_store_nxv16f64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    mv a4, a2
+; CHECK-RV32-NEXT:    bltu a2, a3, .LBB34_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a4, a3
+; CHECK-RV32-NEXT:  .LBB34_2:
+; CHECK-RV32-NEXT:    li a5, 0
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, mu
+; CHECK-RV32-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-RV32-NEXT:    srli a6, a3, 3
+; CHECK-RV32-NEXT:    vsetvli a7, zero, e8, mf4, ta, mu
+; CHECK-RV32-NEXT:    sub a3, a2, a3
+; CHECK-RV32-NEXT:    
vslidedown.vx v0, v0, a6
+; CHECK-RV32-NEXT:    bltu a2, a3, .LBB34_4
+; CHECK-RV32-NEXT:  # %bb.3:
+; CHECK-RV32-NEXT:    mv a5, a3
+; CHECK-RV32-NEXT:  .LBB34_4:
+; CHECK-RV32-NEXT:    mul a2, a4, a1
+; CHECK-RV32-NEXT:    add a0, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a5, e64, m8, ta, mu
+; CHECK-RV32-NEXT:    vsse64.v v16, (a0), a1, v0.t
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    mv a4, a2
+; CHECK-RV64-NEXT:    bltu a2, a3, .LBB34_2
+; CHECK-RV64-NEXT:  # %bb.1:
+; CHECK-RV64-NEXT:    mv a4, a3
+; CHECK-RV64-NEXT:  .LBB34_2:
+; CHECK-RV64-NEXT:    li a5, 0
+; CHECK-RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, mu
+; CHECK-RV64-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-RV64-NEXT:    srli a6, a3, 3
+; CHECK-RV64-NEXT:    vsetvli a7, zero, e8, mf4, ta, mu
+; CHECK-RV64-NEXT:    sub a3, a2, a3
+; CHECK-RV64-NEXT:    vslidedown.vx v0, v0, a6
+; CHECK-RV64-NEXT:    bltu a2, a3, .LBB34_4
+; CHECK-RV64-NEXT:  # %bb.3:
+; CHECK-RV64-NEXT:    mv a5, a3
+; CHECK-RV64-NEXT:  .LBB34_4:
+; CHECK-RV64-NEXT:    mul a2, a4, a1
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, mu
+; CHECK-RV64-NEXT:    vsse64.v v16, (a0), a1, v0.t
+; CHECK-RV64-NEXT:    ret
+  call void @llvm.experimental.vp.strided.store.nxv16f64.p0f64.i32(<vscale x 16 x double> %v, double* %ptr, i32 %stride, <vscale x 16 x i1> %mask, i32 %evl)
+  ret void
+}
+
+define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, double *%ptr, i32 signext %stride, i32 zeroext %evl) {
+; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a4, vlenb
+; CHECK-RV32-NEXT:    mv a3, a2
+; CHECK-RV32-NEXT:    bltu a2, a4, .LBB35_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a3, a4
+; CHECK-RV32-NEXT:  .LBB35_2:
+; CHECK-RV32-NEXT:    li a5, 0
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, mu
+; CHECK-RV32-NEXT:    sub a4, a2, a4
+; CHECK-RV32-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV32-NEXT:    bltu a2, a4, .LBB35_4
+; CHECK-RV32-NEXT:  # %bb.3:
+; CHECK-RV32-NEXT:    mv a5, a4
+; CHECK-RV32-NEXT:  .LBB35_4:
+; CHECK-RV32-NEXT:    mul a2, a3, a1
+; CHECK-RV32-NEXT:    add a0, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a5, e64, m8, ta, mu
+; CHECK-RV32-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    mv a3, a2
+; CHECK-RV64-NEXT:    bltu a2, a4, .LBB35_2
+; CHECK-RV64-NEXT:  # %bb.1:
+; CHECK-RV64-NEXT:    mv a3, a4
+; CHECK-RV64-NEXT:  .LBB35_2:
+; CHECK-RV64-NEXT:    li a5, 0
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, mu
+; CHECK-RV64-NEXT:    sub a4, a2, a4
+; CHECK-RV64-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV64-NEXT:    bltu a2, a4, .LBB35_4
+; CHECK-RV64-NEXT:  # %bb.3:
+; CHECK-RV64-NEXT:    mv a5, a4
+; CHECK-RV64-NEXT:  .LBB35_4:
+; CHECK-RV64-NEXT:    mul a2, a3, a1
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, mu
+; CHECK-RV64-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV64-NEXT:    ret
+  %one = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <vscale x 16 x i1> %one, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  call void @llvm.experimental.vp.strided.store.nxv16f64.p0f64.i32(<vscale x 16 x double> %v, double* %ptr, i32 %stride, <vscale x 16 x i1> %allones, i32 %evl)
+  ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.nxv16f64.p0f64.i32(<vscale x 16 x double>, double*, i32, <vscale x 16 x i1>, i32)
+
+; Widening + splitting (with HiIsEmpty == true)
+define void @strided_store_nxv17f64(<vscale x 17 x double> %v, double* %ptr, i32 signext %stride, <vscale x 17 x i1> %mask, i32 zeroext %evl) {
+; CHECK-RV32-LABEL: strided_store_nxv17f64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -16
+; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT:    csrr a4, vlenb
+; CHECK-RV32-NEXT:    slli a4, a4, 3
+; CHECK-RV32-NEXT:    sub sp, sp, a4
+; CHECK-RV32-NEXT:    csrr a4, vlenb
+; CHECK-RV32-NEXT:    slli a7, a4, 1
+; CHECK-RV32-NEXT:    vmv1r.v v24, v0
+; CHECK-RV32-NEXT:    addi a5, sp, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    mv a6, a3
+; 
CHECK-RV32-NEXT: bltu a3, a7, .LBB36_2 +; CHECK-RV32-NEXT: # %bb.1: +; CHECK-RV32-NEXT: mv a6, a7 +; CHECK-RV32-NEXT: .LBB36_2: +; CHECK-RV32-NEXT: mv a5, a6 +; CHECK-RV32-NEXT: bltu a6, a4, .LBB36_4 +; CHECK-RV32-NEXT: # %bb.3: +; CHECK-RV32-NEXT: mv a5, a4 +; CHECK-RV32-NEXT: .LBB36_4: +; CHECK-RV32-NEXT: li t0, 0 +; CHECK-RV32-NEXT: vl8re64.v v16, (a0) +; CHECK-RV32-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-RV32-NEXT: vmv1r.v v0, v24 +; CHECK-RV32-NEXT: vsse64.v v8, (a1), a2, v0.t +; CHECK-RV32-NEXT: sub a7, a3, a7 +; CHECK-RV32-NEXT: srli a0, a4, 2 +; CHECK-RV32-NEXT: bltu a3, a7, .LBB36_6 +; CHECK-RV32-NEXT: # %bb.5: +; CHECK-RV32-NEXT: mv t0, a7 +; CHECK-RV32-NEXT: .LBB36_6: +; CHECK-RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-RV32-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-RV32-NEXT: bltu t0, a4, .LBB36_8 +; CHECK-RV32-NEXT: # %bb.7: +; CHECK-RV32-NEXT: mv t0, a4 +; CHECK-RV32-NEXT: .LBB36_8: +; CHECK-RV32-NEXT: li a0, 0 +; CHECK-RV32-NEXT: mul a3, a6, a2 +; CHECK-RV32-NEXT: add a7, a1, a3 +; CHECK-RV32-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV32-NEXT: sub a3, a6, a4 +; CHECK-RV32-NEXT: vsse64.v v16, (a7), a2, v0.t +; CHECK-RV32-NEXT: bltu a6, a3, .LBB36_10 +; CHECK-RV32-NEXT: # %bb.9: +; CHECK-RV32-NEXT: mv a0, a3 +; CHECK-RV32-NEXT: .LBB36_10: +; CHECK-RV32-NEXT: srli a3, a4, 3 +; CHECK-RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; CHECK-RV32-NEXT: vslidedown.vx v0, v24, a3 +; CHECK-RV32-NEXT: mul a3, a5, a2 +; CHECK-RV32-NEXT: add a1, a1, a3 +; CHECK-RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vsse64.v v8, (a1), a2, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add sp, sp, a0 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: strided_store_nxv17f64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; 
CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 3 +; CHECK-RV64-NEXT: sub sp, sp, a4 +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a7, a4, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v0 +; CHECK-RV64-NEXT: addi a5, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: mv a6, a3 +; CHECK-RV64-NEXT: bltu a3, a7, .LBB36_2 +; CHECK-RV64-NEXT: # %bb.1: +; CHECK-RV64-NEXT: mv a6, a7 +; CHECK-RV64-NEXT: .LBB36_2: +; CHECK-RV64-NEXT: mv a5, a6 +; CHECK-RV64-NEXT: bltu a6, a4, .LBB36_4 +; CHECK-RV64-NEXT: # %bb.3: +; CHECK-RV64-NEXT: mv a5, a4 +; CHECK-RV64-NEXT: .LBB36_4: +; CHECK-RV64-NEXT: li t0, 0 +; CHECK-RV64-NEXT: vl8re64.v v16, (a0) +; CHECK-RV64-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-RV64-NEXT: vmv1r.v v0, v24 +; CHECK-RV64-NEXT: vsse64.v v8, (a1), a2, v0.t +; CHECK-RV64-NEXT: sub a7, a3, a7 +; CHECK-RV64-NEXT: srli a0, a4, 2 +; CHECK-RV64-NEXT: bltu a3, a7, .LBB36_6 +; CHECK-RV64-NEXT: # %bb.5: +; CHECK-RV64-NEXT: mv t0, a7 +; CHECK-RV64-NEXT: .LBB36_6: +; CHECK-RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; CHECK-RV64-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-RV64-NEXT: bltu t0, a4, .LBB36_8 +; CHECK-RV64-NEXT: # %bb.7: +; CHECK-RV64-NEXT: mv t0, a4 +; CHECK-RV64-NEXT: .LBB36_8: +; CHECK-RV64-NEXT: li a0, 0 +; CHECK-RV64-NEXT: mul a3, a6, a2 +; CHECK-RV64-NEXT: add a7, a1, a3 +; CHECK-RV64-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-RV64-NEXT: sub a3, a6, a4 +; CHECK-RV64-NEXT: vsse64.v v16, (a7), a2, v0.t +; CHECK-RV64-NEXT: bltu a6, a3, .LBB36_10 +; CHECK-RV64-NEXT: # %bb.9: +; CHECK-RV64-NEXT: mv a0, a3 +; CHECK-RV64-NEXT: .LBB36_10: +; CHECK-RV64-NEXT: srli a3, a4, 3 +; CHECK-RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; CHECK-RV64-NEXT: vslidedown.vx v0, v24, a3 +; CHECK-RV64-NEXT: mul a3, a5, a2 +; CHECK-RV64-NEXT: add a1, a1, a3 +; CHECK-RV64-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: 
vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vsse64.v v8, (a1), a2, v0.t
+; CHECK-RV64-NEXT:    csrr a0, vlenb
+; CHECK-RV64-NEXT:    slli a0, a0, 3
+; CHECK-RV64-NEXT:    add sp, sp, a0
+; CHECK-RV64-NEXT:    addi sp, sp, 16
+; CHECK-RV64-NEXT:    ret
+  call void @llvm.experimental.vp.strided.store.nxv17f64.p0f64.i32(<vscale x 17 x double> %v, double* %ptr, i32 %stride, <vscale x 17 x i1> %mask, i32 %evl)
+  ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.nxv17f64.p0f64.i32(<vscale x 17 x double>, double*, i32, <vscale x 17 x i1>, i32)