diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1350,13 +1350,9 @@ SDValue getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); SDValue getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, - Align Alignment, MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo = AAMDNodes(), - bool IsCompressing = false); - SDValue getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, - SDValue Mask, SDValue EVL, MachineMemOperand *MMO, - bool IsCompressing = false); + SDValue Offset, SDValue Mask, SDValue EVL, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + bool IsTruncating = false, bool IsCompressing = false); SDValue getTruncStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -846,6 +846,7 @@ void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -872,6 +873,7 @@ SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); + SDValue 
SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo); @@ -910,6 +912,7 @@ SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); + SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_ScalarOp(SDNode* N); @@ -944,6 +947,7 @@ SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); + SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -937,6 +937,9 @@ case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; + case ISD::VP_LOAD: + SplitVecRes_VP_LOAD(cast(N), Lo, Hi); + break; case ISD::MLOAD: SplitVecRes_MLOAD(cast(N), Lo, Hi); break; @@ -1752,6 +1755,86 @@ ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->isUnindexed() && "Indexed VP load during type legalization!"); + EVT LoVT, HiVT; + SDLoc dl(LD); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0)); + + ISD::LoadExtType ExtType = LD->getExtensionType(); + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + SDValue Offset = LD->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed variable-length load offset"); + Align 
Alignment = LD->getOriginalAlign(); + SDValue Mask = LD->getMask(); + SDValue EVL = LD->getVectorLength(); + EVT MemoryVT = LD->getMemoryVT(); + + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = + DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty); + + // Split Mask operand + SDValue MaskLo, MaskHi; + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } + + // Split EVL operand + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, LD->getValueType(0), dl); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + LD->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges()); + + Lo = + DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset, + MaskLo, EVLLo, LoMemVT, MMO, LD->isExpandingLoad()); + + if (HiIsEmpty) { + // The hi vp_load has zero storage size. We therefore simply set it to + // the low vp_load and rely on subsequent removal from the chain. + Hi = Lo; + } else { + // Generate hi vp_load. 
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, + LD->isExpandingLoad()); + + MachinePointerInfo MPI; + if (LoMemVT.isScalableVector()) + MPI = MachinePointerInfo(LD->getPointerInfo().getAddrSpace()); + else + MPI = LD->getPointerInfo().getWithOffset( + LoMemVT.getStoreSize().getFixedSize()); + + MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment, + LD->getAAInfo(), LD->getRanges()); + + Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr, + Offset, MaskHi, EVLHi, HiMemVT, MMO, + LD->isExpandingLoad()); + } + + // Build a factor node to remember that this load is independent of the + // other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(LD, 1), Ch); +} + void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { assert(MLD->isUnindexed() && "Indexed masked load during type legalization!"); @@ -2192,6 +2275,9 @@ case ISD::STORE: Res = SplitVecOp_STORE(cast(N), OpNo); break; + case ISD::VP_STORE: + Res = SplitVecOp_VP_STORE(cast(N), OpNo); + break; case ISD::MSTORE: Res = SplitVecOp_MSTORE(cast(N), OpNo); break; @@ -2595,6 +2681,84 @@ return SDValue(); } +SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) { + assert(N->isUnindexed() && "Indexed vp_store of vector?"); + SDValue Ch = N->getChain(); + SDValue Ptr = N->getBasePtr(); + SDValue Offset = N->getOffset(); + assert(Offset.isUndef() && "Unexpected VP store offset"); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + SDValue Data = N->getValue(); + Align Alignment = N->getOriginalAlign(); + SDLoc DL(N); + + SDValue DataLo, DataHi; + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + 
GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + + // Split Mask operand + SDValue MaskLo, MaskHi; + if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + } + + EVT MemoryVT = N->getMemoryVT(); + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = + DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty); + + // Split EVL + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, Data.getValueType(), DL); + + SDValue Lo, Hi; + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); + + Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); + + // If the hi vp_store has zero storage size, only the lo vp_store is needed. 
+ if (HiIsEmpty) + return Lo; + + Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, + N->isCompressingStore()); + + MachinePointerInfo MPI; + if (LoMemVT.isScalableVector()) { + Alignment = commonAlignment(Alignment, + LoMemVT.getSizeInBits().getKnownMinSize() / 8); + MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace()); + } else + MPI = N->getPointerInfo().getWithOffset( + LoMemVT.getStoreSize().getFixedSize()); + + MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, + N->getAAInfo(), N->getRanges()); + + Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); + + // Build a factor node to remember that this store is independent of the + // other one. + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { assert(N->isUnindexed() && "Indexed masked store of vector?"); @@ -3028,6 +3192,9 @@ case ISD::VECTOR_SHUFFLE: Res = WidenVecRes_VECTOR_SHUFFLE(cast(N)); break; + case ISD::VP_LOAD: + Res = WidenVecRes_VP_LOAD(cast(N)); + break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); break; @@ -4193,6 +4360,33 @@ report_fatal_error("Unable to widen vector load"); } +SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + ISD::LoadExtType ExtType = N->getExtensionType(); + SDLoc dl(N); + + // The mask should be widened as well + assert(getTypeAction(Mask.getValueType()) == TargetLowering::TypeWidenVector && + "Unable to widen binary VP op"); + EVT WideMaskVT = + TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()); + Mask = GetWidenedVector(Mask); + assert(Mask.getValueType().getVectorElementCount() == + 
WideMaskVT.getVectorElementCount() && + "Unable to widen vector load"); + + SDValue Res = + DAG.getLoadVP(N->getAddressingMode(), ExtType, WidenVT, dl, N->getChain(), + N->getBasePtr(), N->getOffset(), Mask, EVL, + N->getMemoryVT(), N->getMemOperand(), N->isExpandingLoad()); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); @@ -4680,6 +4874,7 @@ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; + case ISD::VP_STORE: Res = WidenVecOp_VP_STORE(N, OpNo); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break; case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; @@ -5061,15 +5256,54 @@ unsigned NumVTElts = StVT.getVectorMinNumElements(); SDValue EVL = DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts)); - const auto *MMO = ST->getMemOperand(); - return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask, - EVL, MMO->getPointerInfo(), MMO->getAlign(), - MMO->getFlags(), MMO->getAAInfo()); + return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), + DAG.getUNDEF(ST->getBasePtr().getValueType()), Mask, + EVL, StVal.getValueType(), ST->getMemOperand(), + ST->getAddressingMode()); } report_fatal_error("Unable to widen vector store"); } +SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) { + assert((OpNo == 1 || OpNo == 3) && + "Can widen only data or mask operand of vp_store"); + VPStoreSDNode *ST = cast(N); + SDValue Mask = ST->getMask(); + SDValue StVal = ST->getValue(); + SDLoc dl(N); + + if (OpNo == 1) { + // Widen the 
value. + StVal = GetWidenedVector(StVal); + + // We only handle the case where the mask needs widening to an + // identically-sized type as the vector inputs. + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen VP store"); + Mask = GetWidenedVector(Mask); + } else { + Mask = GetWidenedVector(Mask); + + // We only handle the case where the stored value needs widening to an + // identically-sized type as the mask. + EVT ValueVT = StVal.getValueType(); + assert(getTypeAction(ValueVT) == TargetLowering::TypeWidenVector && + "Unable to widen VP store"); + StVal = GetWidenedVector(StVal); + } + + assert(Mask.getValueType().getVectorElementCount() == + StVal.getValueType().getVectorElementCount() && + "Mask and data vectors should have the same number of elements"); + return DAG.getStoreVP(ST->getChain(), dl, StVal, ST->getBasePtr(), + ST->getOffset(), Mask, ST->getVectorLength(), + ST->getMemoryVT(), ST->getMemOperand(), + ST->getAddressingMode(), ST->isTruncatingStore(), + ST->isCompressingStore()); +} + SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { assert((OpNo == 1 || OpNo == 3) && "Can widen only data or mask operand of mstore"); @@ -5748,6 +5982,8 @@ EVT InVT = InOp.getValueType(); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); + assert(!InVT.isScalableVector() && !NVT.isScalableVector() && + "cannot modify scalable vectors in this way"); SDLoc dl(InOp); // Check if InOp already has the right width. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7694,23 +7694,6 @@ SDValue Offset, SDValue Mask, SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) { - if (VT == MemVT) { - ExtType = ISD::NON_EXTLOAD; - } else if (ExtType == ISD::NON_EXTLOAD) { - assert(VT == MemVT && "Non-extending load from different memory type!"); - } else { - // Extending load. - assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) && - "Should only be an extending load, not truncating!"); - assert(VT.isInteger() == MemVT.isInteger() && - "Cannot convert from FP to Int or Int -> FP!"); - assert(VT.isVector() == MemVT.isVector() && - "Cannot use an ext load to convert to or from a vector!"); - assert((!VT.isVector() || - VT.getVectorElementCount() == MemVT.getVectorElementCount()) && - "Cannot use an ext load to change the number of vector elements!"); - } - bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!"); @@ -7799,48 +7782,29 @@ } SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, SDValue EVL, - MachinePointerInfo PtrInfo, Align Alignment, - MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, bool IsCompressing) { + SDValue Ptr, SDValue Offset, SDValue Mask, + SDValue EVL, EVT MemVT, MachineMemOperand *MMO, + ISD::MemIndexedMode AM, bool IsTruncating, + bool IsCompressing) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOStore; - assert((MMOFlags & MachineMemOperand::MOLoad) == 0); - - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); - - MachineFunction &MF = getMachineFunction(); - uint64_t Size = - MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize()); - MachineMemOperand *MMO = - 
MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo); - return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing); -} - -SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, SDValue EVL, - MachineMemOperand *MMO, bool IsCompressing) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - EVT VT = Val.getValueType(); - SDVTList VTs = getVTList(MVT::Other); - SDValue Undef = getUNDEF(Ptr.getValueType()); - SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL}; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!"); + SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other) + : getVTList(MVT::Other); + SDValue Ops[] = {Chain, Val, Ptr, Offset, Mask, EVL}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops); - ID.AddInteger(VT.getRawBits()); + ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, ISD::UNINDEXED, false, IsCompressing, VT, MMO)); + dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } - auto *N = - newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, - ISD::UNINDEXED, false, IsCompressing, VT, MMO); + auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, AM, + IsTruncating, IsCompressing, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); @@ -7882,7 +7846,9 @@ assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (VT == SVT) - return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing); + return getStoreVP(Chain, dl, Val, Ptr, getUNDEF(Ptr.getValueType()), Mask, + EVL, VT, MMO, ISD::UNINDEXED, + /*IsTruncating*/ false, IsCompressing); assert(SVT.getScalarType().bitsLT(VT.getScalarType()) && 
"Should only be a truncating store, not extending!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7390,12 +7390,14 @@ AAMDNodes AAInfo = VPIntrin.getAAMetadata(); SDValue ST; if (!IsScatter) { + SDValue Ptr = OpValues[1]; + SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, MemoryLocation::UnknownSize, *Alignment, AAInfo); - ST = - DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1], - OpValues[2], OpValues[3], MMO, false /* IsTruncating */); + ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset, + OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED, + /* IsTruncating */ false, /*IsCompressing*/ false); } else { unsigned AS = PtrOperand->getType()->getScalarType()->getPointerAddressSpace(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7536,12 +7536,12 @@ if (VecVT.getVectorElementType() == MemVT) { SDLoc DL(N); MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount()); - return DAG.getStoreVP(Store->getChain(), DL, Src, Store->getBasePtr(), - DAG.getConstant(1, DL, MaskVT), - DAG.getConstant(1, DL, Subtarget.getXLenVT()), - Store->getPointerInfo(), - Store->getOriginalAlign(), - Store->getMemOperand()->getFlags()); + return DAG.getStoreVP( + Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(), + DAG.getConstant(1, DL, MaskVT), + DAG.getConstant(1, DL, Subtarget.getXLenVT()), MemVT, + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore(), /*IsCompress*/ false); } } diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -16,6 +16,18 @@ ret <2 x i8> %load } +declare <3 x i8> @llvm.vp.load.v3i8.p0v3i8(<3 x i8>*, <3 x i1>, i32) + +define <3 x i8> @vpload_v3i8(<3 x i8>* %ptr, <3 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_v3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <3 x i8> @llvm.vp.load.v3i8.p0v3i8(<3 x i8>* %ptr, <3 x i1> %m, i32 %evl) + ret <3 x i8> %load +} + declare <4 x i8> @llvm.vp.load.v4i8.p0v4i8(<4 x i8>*, <4 x i1>, i32) define <4 x i8> @vpload_v4i8(<4 x i8>* %ptr, <4 x i1> %m, i32 zeroext %evl) { @@ -124,6 +136,30 @@ ret <4 x i32> %load } +declare <6 x i32> @llvm.vp.load.v6i32.p0v6i32(<6 x i32>*, <6 x i1>, i32) + +define <6 x i32> @vpload_v6i32(<6 x i32>* %ptr, <6 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_v6i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <6 x i32> @llvm.vp.load.v6i32.p0v6i32(<6 x i32>* %ptr, <6 x i1> %m, i32 %evl) + ret <6 x i32> %load +} + +define <6 x i32> @vpload_v6i32_allones_mask(<6 x i32>* %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vpload_v6i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %a = insertelement <6 x i1> undef, i1 true, i32 0 + %b = shufflevector <6 x i1> %a, <6 x i1> poison, <6 x i32> zeroinitializer + %load = call <6 x i32> @llvm.vp.load.v6i32.p0v6i32(<6 x i32>* %ptr, <6 x i1> %b, i32 %evl) + ret <6 x i32> %load +} + declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x i1>, i32) define <8 x i32> @vpload_v8i32(<8 x i32>* %ptr, <8 x i1> %m, i32 zeroext %evl) { @@ -339,3 +375,91 @@ %load = call <8 x double> 
@llvm.vp.load.v8f64.p0v8f64(<8 x double>* %ptr, <8 x i1> %m, i32 %evl) ret <8 x double> %load } + +declare <32 x double> @llvm.vp.load.v32f64.p0v32f64(<32 x double>*, <32 x i1>, i32) + +define <32 x double> @vpload_v32f64(<32 x double>* %ptr, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a3, a1, -16 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: bltu a1, a3, .LBB31_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-NEXT: addi a3, a0, 128 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v16, (a3), v0.t +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: bltu a1, a2, .LBB31_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB31_4: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <32 x double> @llvm.vp.load.v32f64.p0v32f64(<32 x double>* %ptr, <32 x i1> %m, i32 %evl) + ret <32 x double> %load +} + +declare <33 x double> @llvm.vp.load.v33f64.p0v33f64(<33 x double>*, <33 x i1>, i32) + +; Widen to v64f64 then split into 4 x v16f64, of which 1 is empty. 
+ +define <33 x double> @vpload_v33f64(<33 x double>* %ptr, <33 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_v33f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a4, a2, -32 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: bltu a2, a4, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: li a6, 16 +; CHECK-NEXT: bltu a5, a6, .LBB32_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: li a5, 16 +; CHECK-NEXT: .LBB32_4: +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v0, v8, 4 +; CHECK-NEXT: addi a4, a1, 256 +; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v16, (a4), v0.t +; CHECK-NEXT: li a4, 32 +; CHECK-NEXT: bltu a2, a4, .LBB32_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: .LBB32_6: +; CHECK-NEXT: addi a5, a2, -16 +; CHECK-NEXT: bltu a2, a5, .LBB32_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: mv a3, a5 +; CHECK-NEXT: .LBB32_8: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-NEXT: addi a4, a1, 128 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v24, (a4), v0.t +; CHECK-NEXT: bltu a2, a6, .LBB32_10 +; CHECK-NEXT: # %bb.9: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: .LBB32_10: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v v8, (a1), v0.t +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vse64.v v24, (a1) +; CHECK-NEXT: vfmv.f.s ft0, v16 +; CHECK-NEXT: fsd ft0, 256(a0) +; CHECK-NEXT: ret + %load = call <33 x double> @llvm.vp.load.v33f64.p0v33f64(<33 x double>* %ptr, <33 x i1> %m, i32 %evl) + ret <33 x double> %load +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -208,6 +208,18 @@ ret void } +declare void @llvm.vp.store.v6f32.p0v6f32(<6 x float>, <6 x float>*, <6 x i1>, i32) + +define void @vpstore_v6f32(<6 x float> %val, <6 x float>* %ptr, <6 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v6f32.p0v6f32(<6 x float> %val, <6 x float>* %ptr, <6 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.vp.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, <8 x i1>, i32) define void @vpstore_v8f32(<8 x float> %val, <8 x float>* %ptr, <8 x i1> %m, i32 zeroext %evl) { @@ -267,3 +279,32 @@ call void @llvm.vp.store.v2i8.p0v2i8(<2 x i8> %val, <2 x i8>* %ptr, <2 x i1> %b, i32 %evl) ret void } + +declare void @llvm.vp.store.v32f64.p0v32f64(<32 x double>, <32 x double>*, <32 x i1>, i32) + +define void @vpstore_v32f64(<32 x double> %val, <32 x double>* %ptr, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: bltu a1, a2, .LBB23_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-NEXT: addi a3, a1, -16 +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: bltu a1, a3, .LBB23_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB23_4: +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; CHECK-NEXT: vse64.v v16, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v32f64.p0v32f64(<32 x double> %val, <32 x double>* %ptr, <32 x i1> %m, i32 %evl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll 
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.load.nxv1i8.p0nxv1i8(*, , i32) @@ -40,6 +40,18 @@ ret %load } +declare @llvm.vp.load.nxv3i8.p0nxv3i8(*, , i32) + +define @vpload_nxv3i8(* %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv3i8.p0nxv3i8(* %ptr, %m, i32 %evl) + ret %load +} + declare @llvm.vp.load.nxv4i8.p0nxv4i8(*, , i32) define @vpload_nxv4i8(* %ptr, %m, i32 zeroext %evl) { @@ -435,3 +447,103 @@ %load = call @llvm.vp.load.nxv8f64.p0nxv8f64(* %ptr, %m, i32 %evl) ret %load } + +declare @llvm.vp.load.nxv16f64.p0nxv16f64(*, , i32) + +define @vpload_nxv16f64(* %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_nxv16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; CHECK-NEXT: sub a4, a1, a2 +; CHECK-NEXT: vslidedown.vx v0, v0, a5 +; CHECK-NEXT: bltu a1, a4, .LBB37_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a3, a4 +; CHECK-NEXT: .LBB37_2: +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v16, (a4), v0.t +; CHECK-NEXT: bltu a1, a2, .LBB37_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB37_4: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; 
CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv16f64.p0nxv16f64(* %ptr, %m, i32 %evl) + ret %load +} + +declare @llvm.vp.load.nxv17f64.p0nxv17f64(*, , i32) + +declare @llvm.experimental.vector.extract.nxv1f64( %vec, i64 %idx) +declare @llvm.experimental.vector.extract.nxv16f64( %vec, i64 %idx) + +; Note: We can't return <vscale x 17 x double> as that introduces a vector +; store that can't yet be legalized through widening. In order to test purely the +; vp.load legalization, manually split it. + +; Widen to nxv32f64 then split into 4 x nxv8f64, of which 1 is empty. + +define @vpload_nxv17f64(* %ptr, * %out, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_nxv17f64: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a7, a3, 1 +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: bltu a2, a7, .LBB38_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv t0, a7 +; CHECK-NEXT: .LBB38_2: +; CHECK-NEXT: sub a5, t0, a3 +; CHECK-NEXT: li a6, 0 +; CHECK-NEXT: bltu t0, a5, .LBB38_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a6, a5 +; CHECK-NEXT: .LBB38_4: +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: srli t1, a3, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vx v0, v8, t1 +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v16, (a4), v0.t +; CHECK-NEXT: srli a6, a3, 2 +; CHECK-NEXT: sub a4, a2, a7 +; CHECK-NEXT: slli a7, a3, 4 +; CHECK-NEXT: bltu a2, a4, .LBB38_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: .LBB38_6: +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vx v0, v8, a6 +; CHECK-NEXT: add a2, a0, a7 +; CHECK-NEXT: bltu a5, a3, .LBB38_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: mv a5, a3 +; CHECK-NEXT: .LBB38_8: +; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v24, (a2), v0.t +; CHECK-NEXT: bltu t0, a3, .LBB38_10 +; CHECK-NEXT: # %bb.9: +;
CHECK-NEXT: mv t0, a3 +; CHECK-NEXT: .LBB38_10: +; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: vs1r.v v24, (a1) +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv17f64.p0nxv17f64(* %ptr, %m, i32 %evl) + %lo = call @llvm.experimental.vector.extract.nxv16f64( %load, i64 0) + %hi = call @llvm.experimental.vector.extract.nxv1f64( %load, i64 16) + store %hi, * %out + ret %lo +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.vp.store.nxv1i8.p0nxv1i8(, *, , i32) @@ -28,6 +28,18 @@ ret void } +declare void @llvm.vp.store.nxv3i8.p0nxv3i8(, *, , i32) + +define void @vpstore_nxv3i8( %val, * %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.nxv3i8.p0nxv3i8( %val, * %ptr, %m, i32 %evl) + ret void +} + declare void @llvm.vp.store.nxv4i8.p0nxv4i8(, *, , i32) define void @vpstore_nxv4i8( %val, * %ptr, %m, i32 zeroext %evl) { @@ -351,3 +363,106 @@ call void @llvm.vp.store.nxv1i8.p0nxv1i8( %val, * %ptr, %b, i32 %evl) ret void } + +declare void @llvm.vp.store.nxv16f64.p0nxv16f64(, *, , i32) + +define void @vpstore_nxv16f64( %val, * %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_nxv16f64: +; 
CHECK: # %bb.0: +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: bltu a1, a2, .LBB30_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, mu +; CHECK-NEXT: sub a3, a1, a2 +; CHECK-NEXT: vslidedown.vx v0, v0, a5 +; CHECK-NEXT: bltu a1, a3, .LBB30_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: .LBB30_4: +; CHECK-NEXT: slli a1, a2, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-NEXT: vse64.v v16, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.nxv16f64.p0nxv16f64( %val, * %ptr, %m, i32 %evl) + ret void +} + +declare void @llvm.vp.store.nxv17f64.p0nxv17f64(, *, , i32) + +; Widen to nxv32f64 then split into 4 x nxv8f64, of which 1 is empty. + +define void @vpstore_nxv17f64( %val, * %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_nxv17f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: sub sp, sp, a3 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a6, a3, 1 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: mv a5, a2 +; CHECK-NEXT: bltu a2, a6, .LBB31_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a5, a6 +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: mv a4, a5 +; CHECK-NEXT: bltu a5, a3, .LBB31_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: .LBB31_4: +; CHECK-NEXT: li a7, 0 +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; CHECK-NEXT: sub a0, a5, a3 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vse64.v v8, (a1), v0.t +; CHECK-NEXT: bltu a5, a0, .LBB31_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: mv a7, a0 +; CHECK-NEXT: 
.LBB31_6: +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: srli a4, a3, 3 +; CHECK-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vx v0, v24, a4 +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a4, a1, a4 +; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, mu +; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a5) # Unknown-size Folded Reload +; CHECK-NEXT: vse64.v v8, (a4), v0.t +; CHECK-NEXT: srli a7, a3, 2 +; CHECK-NEXT: sub a4, a2, a6 +; CHECK-NEXT: slli a5, a3, 4 +; CHECK-NEXT: bltu a2, a4, .LBB31_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: .LBB31_8: +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vx v0, v24, a7 +; CHECK-NEXT: add a1, a1, a5 +; CHECK-NEXT: bltu a0, a3, .LBB31_10 +; CHECK-NEXT: # %bb.9: +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: .LBB31_10: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; CHECK-NEXT: vse64.v v16, (a1), v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + call void @llvm.vp.store.nxv17f64.p0nxv17f64( %val, * %ptr, %m, i32 %evl) + ret void +}