diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1356,6 +1356,10 @@
   SDValue getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
                      SDValue Mask, SDValue EVL, MachineMemOperand *MMO,
                      bool IsCompressing = false);
+  SDValue getStoreVP(ISD::MemIndexedMode AM, bool IsTruncating, SDValue Chain,
+                     const SDLoc &dl, SDValue Val, SDValue Ptr, SDValue Offset,
+                     SDValue Mask, SDValue EVL, EVT MemVT,
+                     MachineMemOperand *MMO, bool IsCompressing = false);
   SDValue getTruncStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
                           SDValue Ptr, SDValue Mask, SDValue EVL,
                           MachinePointerInfo PtrInfo, EVT SVT, Align Alignment,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -70,6 +70,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+  case ISD::VP_LOAD:
+    Res = PromoteIntRes_VP_LOAD(cast<VPLoadSDNode>(N));
+    break;
   case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
   case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
@@ -738,6 +741,23 @@
   return Res;
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_LOAD(VPLoadSDNode *N) {
+  assert(!N->isIndexed() && "Indexed vp_load during type legalization!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  ISD::LoadExtType ExtType = (N->getExtensionType() == ISD::NON_EXTLOAD)
+                                 ? ISD::EXTLOAD
+                                 : N->getExtensionType();
+  SDLoc dl(N);
+  SDValue Res =
+      DAG.getLoadVP(N->getAddressingMode(), ExtType, NVT, dl, N->getChain(),
+                    N->getBasePtr(), N->getOffset(), N->getMask(),
+                    N->getVectorLength(), N->getMemoryVT(), N->getMemOperand());
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
@@ -1564,8 +1584,14 @@
   case ISD::STRICT_SINT_TO_FP:
     Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break;
   case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+  case ISD::VP_STORE:
+    Res = PromoteIntOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
+    break;
   case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
     break;
+  case ISD::VP_LOAD:
+    Res = PromoteIntOp_VP_LOAD(cast<VPLoadSDNode>(N), OpNo);
+    break;
   case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N), OpNo);
     break;
   case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
@@ -1885,6 +1911,50 @@
                      N->getMemoryVT(), N->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_STORE(VPStoreSDNode *N,
+                                                unsigned OpNo) {
+  SDValue DataOp = N->getValue();
+  SDValue Operand = N->getOperand(OpNo);
+
+  if (OpNo >= 4) {
+    // The Mask or EVL. Update in place.
+    EVT DataVT = DataOp.getValueType();
+    SDValue PromotedOperand = OpNo == 4 ? PromoteTargetBoolean(Operand, DataVT)
+                                        : ZExtPromotedInteger(Operand);
+    SmallVector<SDValue, 6> NewOps(N->op_begin(), N->op_end());
+    NewOps[OpNo] = PromotedOperand;
+    return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+  }
+
+  assert(OpNo == 1 && "Unexpected operand for promotion");
+  DataOp = GetPromotedInteger(DataOp);
+
+  assert(!N->isIndexed() && "expecting unindexed vp_store!");
+
+  return DAG.getTruncStoreVP(N->getChain(), SDLoc(N), DataOp, N->getBasePtr(),
+                             N->getMask(), N->getVectorLength(),
+                             N->getMemoryVT(), N->getMemOperand(),
+                             N->isCompressingStore());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_LOAD(VPLoadSDNode *N, unsigned OpNo) {
+  assert(OpNo >= 3 && "Only know how to promote the mask or length!");
+  EVT DataVT = N->getValueType(0);
+  SDValue Operand = N->getOperand(OpNo);
+  SDValue PromotedOperand = OpNo == 3 ? PromoteTargetBoolean(Operand, DataVT)
+                                      : ZExtPromotedInteger(Operand);
+  SmallVector<SDValue, 6> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = PromotedOperand;
+  SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+  if (Res == N)
+    return SDValue(Res, 0);
+
+  // Update triggered CSE, do our own replacement since caller can't.
+  ReplaceValueWith(SDValue(N, 0), SDValue(Res, 0));
+  ReplaceValueWith(SDValue(N, 1), SDValue(Res, 1));
+  return SDValue();
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
                                               unsigned OpNo) {
   SDValue DataOp = N->getValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -324,6 +324,7 @@
   SDValue PromoteIntRes_FREEZE(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_VP_LOAD(VPLoadSDNode *N);
   SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
   SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
@@ -384,7 +385,9 @@
   SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
   SDValue PromoteIntOp_STRICT_UINT_TO_FP(SDNode *N);
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_VP_LOAD(VPLoadSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
@@ -837,6 +840,7 @@
   void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -863,6 +867,7 @@
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo);
@@ -900,6 +905,7 @@
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
+  SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
   SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
   SDValue WidenVecRes_ScalarOp(SDNode* N);
@@ -934,6 +940,7 @@
   SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
+  SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo);
   SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -936,6 +936,9 @@
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
    break;
+  case ISD::VP_LOAD:
+    SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi);
+    break;
   case ISD::MLOAD:
     SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
     break;
@@ -1781,6 +1784,91 @@
   ReplaceValueWith(SDValue(LD, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
+                                           SDValue &Hi) {
+  assert(LD->isUnindexed() && "Indexed VP load during type legalization!");
+  EVT LoVT, HiVT;
+  SDLoc dl(LD);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  assert(Offset.isUndef() && "Unexpected indexed variable-length load offset");
+  Align Alignment = LD->getOriginalAlign();
+  SDValue Mask = LD->getMask();
+  SDValue EVL = LD->getVectorLength();
+  EVT MemoryVT = LD->getMemoryVT();
+
+  EVT LoMemVT, HiMemVT;
+  bool HiIsEmpty = false;
+  std::tie(LoMemVT, HiMemVT) =
+      DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
+
+  // Split Mask operand
+  SDValue MaskLo, MaskHi;
+  if (Mask.getOpcode() == ISD::SETCC) {
+    SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+  } else {
+    if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+      GetSplitVector(Mask, MaskLo, MaskHi);
+    else
+      std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+  }
+
+  // Split EVL operand
+  EVT EVLVT = EVL.getValueType();
+  SDValue EVLLo =
+      DAG.getNode(ISD::UMIN, dl, EVLVT, EVL,
+                  DAG.getConstant(LoMemVT.getVectorNumElements(), dl, EVLVT));
+  SDValue EVLHi = DAG.getNode(ISD::SUB, dl, EVLVT, EVL, EVLLo);
+
+  unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize());
+  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      LD->getPointerInfo(), MachineMemOperand::MOLoad, LoSize, Alignment,
+      LD->getAAInfo(), LD->getRanges());
+
+  Lo =
+      DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset,
+                    MaskLo, EVLLo, LoMemVT, MMO, LD->isExpandingLoad());
+
+  if (HiIsEmpty) {
+    // The hi vp_load has zero storage size. We therefore simply set it to
+    // the low vp_load and rely on subsequent removal from the chain.
+    Hi = Lo;
+  } else {
+    // Generate hi vp_load.
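+    // For a non-expanding load the hi half starts LoMemVT.getStoreSize()
+    // bytes past the base pointer; for scalable vectors that byte offset is
+    // not a compile-time constant, so the MachinePointerInfo below falls
+    // back to just the address space.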
+    Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
+                                     LD->isExpandingLoad());
+    unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize());
+
+    MachinePointerInfo MPI;
+    if (LoMemVT.isScalableVector())
+      MPI = MachinePointerInfo(LD->getPointerInfo().getAddrSpace());
+    else
+      MPI = LD->getPointerInfo().getWithOffset(
+          LoMemVT.getStoreSize().getFixedSize());
+
+    MMO = DAG.getMachineFunction().getMachineMemOperand(
+        MPI, MachineMemOperand::MOLoad, HiSize, Alignment, LD->getAAInfo(),
+        LD->getRanges());
+
+    Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr,
+                       Offset, MaskHi, EVLHi, HiMemVT, MMO,
+                       LD->isExpandingLoad());
+  }
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
 void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo,
                                          SDValue &Hi) {
   assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
@@ -2222,6 +2310,9 @@
   case ISD::STORE:
     Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
     break;
+  case ISD::VP_STORE:
+    Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
+    break;
   case ISD::MSTORE:
     Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
     break;
@@ -2625,6 +2716,92 @@
   return SDValue();
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
+  assert(N->isUnindexed() && "Indexed vp_store of vector?");
+  SDValue Ch = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  SDValue Offset = N->getOffset();
+  assert(Offset.isUndef() && "Unexpected indexed vp_store offset");
+  SDValue Mask = N->getMask();
+  SDValue EVL = N->getVectorLength();
+  EVT EVLVT = EVL.getValueType();
+  SDValue Data = N->getValue();
+  Align Alignment = N->getOriginalAlign();
+  SDLoc DL(N);
+
+  SDValue DataLo, DataHi;
+  if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+    // Split Data operand
+    GetSplitVector(Data, DataLo, DataHi);
+  else
+    std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+  // Split Mask operand
+  SDValue MaskLo, MaskHi;
+  if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+    SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+  } else {
+    if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+      GetSplitVector(Mask, MaskLo, MaskHi);
+    else
+      std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+  }
+
+  EVT MemoryVT = N->getMemoryVT();
+  EVT LoMemVT, HiMemVT;
+  bool HiIsEmpty = false;
+  std::tie(LoMemVT, HiMemVT) =
+      DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
+
+  SDValue EVLLo =
+      DAG.getNode(ISD::UMIN, DL, EVLVT, EVL,
+                  DAG.getConstant(LoMemVT.getVectorNumElements(), DL, EVLVT));
+  SDValue EVLHi = DAG.getNode(ISD::SUB, DL, EVLVT, EVL, EVLLo);
+
+  SDValue Lo, Hi, Res;
+  unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize());
+  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      N->getPointerInfo(), MachineMemOperand::MOStore, LoSize, Alignment,
+      N->getAAInfo(), N->getRanges());
+
+  Lo = DAG.getStoreVP(N->getAddressingMode(), N->isTruncatingStore(), Ch, DL,
+                      DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO,
+                      N->isCompressingStore());
+
+  if (HiIsEmpty) {
+    // The hi vp_store has zero storage size.
+    // Only the lo vp_store is needed.
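+    // (GetDependentSplitDestVTs sets HiIsEmpty when the lo part already
+    // covers the whole memory VT, so no second store has to be emitted.)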
+    Res = Lo;
+  } else {
+    Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
+                                     N->isCompressingStore());
+
+    MachinePointerInfo MPI;
+    if (LoMemVT.isScalableVector()) {
+      Alignment = commonAlignment(
+          Alignment, LoMemVT.getSizeInBits().getKnownMinSize() / 8);
+      MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+    } else
+      MPI = N->getPointerInfo().getWithOffset(
+          LoMemVT.getStoreSize().getFixedSize());
+
+    unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize());
+    MMO = DAG.getMachineFunction().getMachineMemOperand(
+        MPI, MachineMemOperand::MOStore, HiSize, Alignment, N->getAAInfo(),
+        N->getRanges());
+
+    Hi = DAG.getStoreVP(N->getAddressingMode(), N->isTruncatingStore(), Ch, DL,
+                        DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO,
+                        N->isCompressingStore());
+
+    // Build a factor node to remember that this store is independent of the
+    // other one.
+    Res = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+  }
+
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
                                             unsigned OpNo) {
   assert(N->isUnindexed() && "Indexed masked store of vector?");
@@ -3054,6 +3231,9 @@
   case ISD::VECTOR_SHUFFLE:
     Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
     break;
+  case ISD::VP_LOAD:
+    Res = WidenVecRes_VP_LOAD(cast<VPLoadSDNode>(N));
+    break;
   case ISD::MLOAD:
     Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
@@ -4149,6 +4329,30 @@
   return Result;
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Mask = N->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue EVL = N->getVectorLength();
+  ISD::LoadExtType ExtType = N->getExtensionType();
+  SDLoc dl(N);
+
+  // The mask should be widened as well
+  EVT WideMaskVT =
+      EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(),
+                       WidenVT.getVectorNumElements());
+  Mask = ModifyToType(Mask, WideMaskVT, true);
+
+  SDValue Res =
+      DAG.getLoadVP(N->getAddressingMode(), ExtType, WidenVT, dl, N->getChain(),
+                    N->getBasePtr(), N->getOffset(), Mask, EVL,
+                    N->getMemoryVT(), N->getMemOperand(), N->isExpandingLoad());
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
 
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
@@ -4635,6 +4839,9 @@
   case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE: Res = WidenVecOp_STORE(N); break;
+  case ISD::VP_STORE:
+    Res = WidenVecOp_VP_STORE(N, OpNo);
+    break;
   case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
   case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
   case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
@@ -4995,6 +5202,46 @@
   return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) {
+  assert((OpNo == 1 || OpNo == 3) &&
+         "Can widen only data or mask operand of vp_store");
+  VPStoreSDNode *ST = cast<VPStoreSDNode>(N);
+  SDValue Mask = ST->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue StVal = ST->getValue();
+  SDLoc dl(N);
+
+  if (OpNo == 1) {
+    // Widen the value.
+    StVal = GetWidenedVector(StVal);
+
+    // The mask should be widened as well.
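+    // ModifyToType pads the extra mask lanes with zeroes and the EVL operand
+    // is left untouched, so the widened lanes stay inactive and the store
+    // still writes exactly the original elements.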
+    EVT WideVT = StVal.getValueType();
+    EVT WideMaskVT =
+        EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(),
+                         WideVT.getVectorNumElements());
+    Mask = ModifyToType(Mask, WideMaskVT, true);
+  } else {
+    // Widen the mask.
+    EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), MaskVT);
+    Mask = ModifyToType(Mask, WideMaskVT, true);
+
+    EVT ValueVT = StVal.getValueType();
+    EVT WideVT =
+        EVT::getVectorVT(*DAG.getContext(), ValueVT.getVectorElementType(),
+                         WideMaskVT.getVectorNumElements());
+    StVal = ModifyToType(StVal, WideVT);
+  }
+
+  assert(Mask.getValueType().getVectorNumElements() ==
+             StVal.getValueType().getVectorNumElements() &&
+         "Mask and data vectors should have the same number of elements");
+  return DAG.getStoreVP(ST->getAddressingMode(), false, ST->getChain(), dl,
+                        StVal, ST->getBasePtr(), ST->getOffset(), Mask,
+                        ST->getVectorLength(), ST->getMemoryVT(),
+                        ST->getMemOperand(), ST->isCompressingStore());
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
   assert((OpNo == 1 || OpNo == 3) &&
          "Can widen only data or mask operand of mstore");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7691,23 +7691,6 @@
                                SDValue Offset, SDValue Mask, SDValue EVL,
                                EVT MemVT, MachineMemOperand *MMO,
                                bool IsExpanding) {
-  if (VT == MemVT) {
-    ExtType = ISD::NON_EXTLOAD;
-  } else if (ExtType == ISD::NON_EXTLOAD) {
-    assert(VT == MemVT && "Non-extending load from different memory type!");
-  } else {
-    // Extending load.
-    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
-           "Should only be an extending load, not truncating!");
-    assert(VT.isInteger() == MemVT.isInteger() &&
-           "Cannot convert from FP to Int or Int -> FP!");
-    assert(VT.isVector() == MemVT.isVector() &&
-           "Cannot use an ext load to convert to or from a vector!");
-    assert((!VT.isVector() ||
-            VT.getVectorElementCount() == MemVT.getVectorElementCount()) &&
-           "Cannot use an ext load to change the number of vector elements!");
-  }
-
   bool Indexed = AM != ISD::UNINDEXED;
   assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
@@ -7847,6 +7830,39 @@
   return V;
 }
 
+SDValue SelectionDAG::getStoreVP(ISD::MemIndexedMode AM, bool IsTruncating,
+                                 SDValue Chain, const SDLoc &dl, SDValue Val,
+                                 SDValue Ptr, SDValue Offset, SDValue Mask,
+                                 SDValue EVL, EVT MemVT, MachineMemOperand *MMO,
+                                 bool IsCompressing) {
+  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+  bool Indexed = AM != ISD::UNINDEXED;
+  assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!");
+  SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other)
+                         : getVTList(MVT::Other);
+  SDValue Ops[] = {Chain, Val, Ptr, Offset, Mask, EVL};
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
+      dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+    cast<VPStoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  auto *N = newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+                                     IsTruncating, IsCompressing, MemVT, MMO);
+  createOperands(N, Ops);
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
                                       SDValue Val, SDValue Ptr, SDValue Mask,
                                       SDValue EVL, MachinePointerInfo PtrInfo,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7371,7 +7371,7 @@
         MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
         VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges);
     LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
-                       MMO, false /*IsExpanding */);
+                       MMO);
   } else {
     unsigned AS =
        PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-vp-mem-legalization.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-vp-mem-legalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-vp-mem-legalization.ll
@@ -0,0 +1,1013 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s
+
+
+define void @vp_store_v4i32(<4 x i32>* %ptr, <4 x i32> %val, <4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: vp_store_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: srli a1, a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.vp.store.v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> %m, i32 %evl)
+ ret void
+}
+declare void @llvm.vp.store.v4i32(<4 x i32>, <4 x i32>*, <4 x i1>, i32)
+define <4 x i32> @vp_load_v4i32_i32(<4 x i32>* %ptr, <4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: vp_load_v4i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: srli a1, a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vle32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %res = call <4 x i32> @llvm.vp.load.v4i32(<4 x i32>* %ptr, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.vp.load.v4i32(<4 x i32>*, <4 x i1>, i32)
+
+define void @vp_store_v4i16(<4 x i16>* %ptr, <4 x i16> %val, <4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: vp_store_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: srli a1, a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.vp.store.v4i16(<4 x i16> %val, <4 x i16>* %ptr, <4 x i1> %m, i32 %evl)
+ ret void
+}
+declare void @llvm.vp.store.v4i16(<4 x i16>, <4 x i16>*, <4 x i1>, i32)
+define <4 x i16> @vp_load_v4i16_i32(<4 x i16>* %ptr, <4 x i1> %m, i32 %evl) {
+;
CHECK-LABEL: vp_load_v4i16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.vp.load.v4i16(<4 x i16>* %ptr, <4 x i1> %m, i32 %evl) + ret <4 x i16> %res +} +declare <4 x i16> @llvm.vp.load.v4i16(<4 x i16>*, <4 x i1>, i32) + +define void @vp_store_v8i16(<8 x i16>* %ptr, <8 x i16> %val, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v8i16(<8 x i16> %val, <8 x i16>* %ptr, <8 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i16(<8 x i16>, <8 x i16>*, <8 x i1>, i32) +define <8 x i16> @vp_load_v8i16_i32(<8 x i16>* %ptr, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v8i16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.vp.load.v8i16(<8 x i16>* %ptr, <8 x i1> %m, i32 %evl) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.vp.load.v8i16(<8 x i16>*, <8 x i1>, i32) + +define void @vp_store_v8i8(<8 x i8>* %ptr, <8 x i8> %val, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v8i8(<8 x i8> %val, <8 x i8>* %ptr, <8 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i8(<8 x i8>, <8 x i8>*, <8 x i1>, i32) +define <8 x i8> @vp_load_v8i8_i32(<8 x i8>* %ptr, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v8i8_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.vp.load.v8i8(<8 x i8>* %ptr, <8 x i1> %m, i32 %evl) + ret <8 x i8> %res +} +declare <8 x i8> @llvm.vp.load.v8i8(<8 x i8>*, <8 x i1>, i32) + +define void @vp_store_v16i8(<16 x i8>* %ptr, <16 x i8> %val, <16 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v16i8(<16 x i8> %val, <16 x i8>* %ptr, <16 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v16i8(<16 x i8>, <16 x i8>*, <16 x i1>, i32) +define <16 x i8> @vp_load_v16i8_i32(<16 x i8>* %ptr, <16 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v16i8_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.vp.load.v16i8(<16 x i8>* %ptr, <16 x i1> %m, i32 %evl) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.vp.load.v16i8(<16 x i8>*, <16 x i1>, i32) + +define void @vp_store_v8i7(<8 x i7>* %ptr, <8 x i7> %val, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + 
call void @llvm.vp.store.v8i7(<8 x i7> %val, <8 x i7>* %ptr, <8 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i7(<8 x i7>, <8 x i7>*, <8 x i1>, i32) +define <8 x i7> @vp_load_v8i7_i32(<8 x i7>* %ptr, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v8i7_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <8 x i7> @llvm.vp.load.v8i7(<8 x i7>* %ptr, <8 x i1> %m, i32 %evl) + ret <8 x i7> %res +} +declare <8 x i7> @llvm.vp.load.v8i7(<8 x i7>*, <8 x i1>, i32) + +define void @vp_store_v3i32(<3 x i32>* %ptr, <3 x i32> %val, <3 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v3i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: addi a2, sp, 12 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + call void @llvm.vp.store.v3i32(<3 x i32> %val, <3 x i32>* %ptr, <3 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v3i32(<3 x i32>, <3 x i32>*, <3 x i1>, i32) +define <3 x i32> @vp_load_v3i32_i32(<3 x i32>* %ptr, <3 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v3i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: addi a2, sp, 12 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call <3 x i32> @llvm.vp.load.v3i32(<3 x i32>* %ptr, <3 x i1> %m, i32 %evl) + ret <3 x i32> %res +} +declare <3 x i32> @llvm.vp.load.v3i32(<3 x i32>*, <3 x i1>, i32) + +define void @vp_store_v7i16(<7 x i16>* %ptr, <7 x i16> %val, <7 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v7i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 8(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; 
CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 11(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 10(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 9(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: addi a2, sp, 8 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + call void @llvm.vp.store.v7i16(<7 x i16> %val, <7 x i16>* %ptr, <7 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v7i16(<7 x i16>, <7 x i16>*, <7 x i1>, i32) +define <7 x i16> @vp_load_v7i16_i32(<7 x i16>* %ptr, <7 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v7i16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 8(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 11(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 10(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 9(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: addi a2, sp, 8 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call <7 x i16> @llvm.vp.load.v7i16(<7 x i16>* %ptr, <7 x i1> %m, i32 %evl) + ret <7 x i16> %res +} +declare <7 x i16> @llvm.vp.load.v7i16(<7 x i16>*, <7 x i1>, i32) + +define void @vp_store_v15i8(<15 x i8>* %ptr, <15 x i8> %val, <15 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v15i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sb zero, 31(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 16(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 14 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 30(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 13 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 29(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 12 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 
28(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 11 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 27(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 10 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 26(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 9 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 25(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 8 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 24(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 7 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 23(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 22(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 21(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 20(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 19(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 18(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 17(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: ret + call void @llvm.vp.store.v15i8(<15 x i8> %val, <15 x i8>* %ptr, <15 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v15i8(<15 x i8>, <15 x i8>*, <15 x i1>, i32) +define <15 x i8> @vp_load_v15i8_i32(<15 x i8>* %ptr, <15 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v15i8_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sb zero, 31(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 16(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 14 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 30(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 13 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 29(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 12 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 28(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 11 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 27(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 10 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 26(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 9 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 25(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 8 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 24(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 7 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 23(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 22(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 21(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 20(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 19(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 18(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 
17(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: ret + %res = call <15 x i8> @llvm.vp.load.v15i8(<15 x i8>* %ptr, <15 x i1> %m, i32 %evl) + ret <15 x i8> %res +} +declare <15 x i8> @llvm.vp.load.v15i8(<15 x i8>*, <15 x i1>, i32) + +define void @vp_store_v8i32(<8 x i32>* %ptr, <8 x i32> %val, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v8i32(<8 x i32> %val, <8 x i32>* %ptr, <8 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v8i32(<8 x i32>, <8 x i32>*, <8 x i1>, i32) +define <8 x i32> @vp_load_v8i32_i32(<8 x i32>* %ptr, <8 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v8i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.vp.load.v8i32(<8 x i32>* %ptr, <8 x i1> %m, i32 %evl) + ret <8 x i32> %res +} +declare <8 x i32> @llvm.vp.load.v8i32(<8 x i32>*, <8 x i1>, i32) + +define void @vp_store_v7i32(<7 x i32>* %ptr, <7 x i32> %val, <7 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v7i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 8(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 11(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 10(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 9(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: addi a2, sp, 8 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + call void @llvm.vp.store.v7i32(<7 x i32> %val, <7 x i32>* %ptr, <7 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v7i32(<7 x i32>, <7 x i32>*, <7 x i1>, i32) +define <7 x i32> @vp_load_v7i32_i32(<7 x i32>* %ptr, <7 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v7i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sb zero, 15(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: 
sb a2, 8(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 14(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 13(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 12(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 11(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 10(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 9(sp) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: addi a2, sp, 8 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call <7 x i32> @llvm.vp.load.v7i32(<7 x i32>* %ptr, <7 x i1> %m, i32 %evl) + ret <7 x i32> %res +} +declare <7 x i32> @llvm.vp.load.v7i32(<7 x i32>*, <7 x i1>, i32) + +define void @vp_store_v16i16(<16 x i16>* %ptr, <16 x i16> %val, <16 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v16i16(<16 x i16> %val, <16 x i16>* %ptr, <16 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v16i16(<16 x i16>, <16 x i16>*, <16 x i1>, i32) +define <16 x i16> @vp_load_v16i16_i32(<16 x i16>* %ptr, <16 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v16i16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.vp.load.v16i16(<16 x i16>* %ptr, <16 x i1> %m, i32 %evl) + ret <16 x i16> %res +} +declare <16 x i16> @llvm.vp.load.v16i16(<16 x i16>*, <16 x i1>, i32) + +define void @vp_store_v15i16(<15 x i16>* %ptr, <15 x i16> %val, <15 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v15i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sb zero, 31(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 16(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 14 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 30(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 13 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 29(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 12 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 28(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 11 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 27(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 10 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 26(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 9 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 25(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 8 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 24(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 7 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 23(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: 
vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 22(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 21(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 20(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 19(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 18(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 17(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: ret + call void @llvm.vp.store.v15i16(<15 x i16> %val, <15 x i16>* %ptr, <15 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v15i16(<15 x i16>, <15 x i16>*, <15 x i1>, i32) +define <15 x i16> @vp_load_v15i16_i32(<15 x i16>* %ptr, <15 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v15i16_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: sb zero, 31(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.i v25, 0 +; CHECK-NEXT: vmerge.vim v25, v25, 1, v0 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 16(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v26, v25, 14 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 30(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 13 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 29(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 12 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 28(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 11 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 27(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 10 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 26(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 9 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 25(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 8 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 24(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 7 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 23(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 6 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 22(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 5 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 21(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 4 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 20(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 3 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 19(sp) +; CHECK-NEXT: vslidedown.vi v26, v25, 2 +; CHECK-NEXT: vmv.x.s a2, v26 +; CHECK-NEXT: sb a2, 18(sp) +; CHECK-NEXT: vslidedown.vi v25, v25, 1 +; CHECK-NEXT: vmv.x.s a2, v25 +; CHECK-NEXT: sb a2, 17(sp) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vle8.v v25, (a2) +; CHECK-NEXT: vand.vi v25, v25, 1 +; CHECK-NEXT: vmsne.vi v0, v25, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: ret + %res = call <15 x i16> @llvm.vp.load.v15i16(<15 x i16>* %ptr, <15 x i1> %m, i32 %evl) + ret <15 x i16> %res +} +declare <15 x i16> @llvm.vp.load.v15i16(<15 x i16>*, <15 
x i1>, i32) + +define void @vp_store_v32i8(<32 x i8>* %ptr, <32 x i8> %val, <32 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.v32i8(<32 x i8> %val, <32 x i8>* %ptr, <32 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v32i8(<32 x i8>, <32 x i8>*, <32 x i1>, i32) +define <32 x i8> @vp_load_v32i8_i32(<32 x i8>* %ptr, <32 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v32i8_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %res = call <32 x i8> @llvm.vp.load.v32i8(<32 x i8>* %ptr, <32 x i1> %m, i32 %evl) + ret <32 x i8> %res +} +declare <32 x i8> @llvm.vp.load.v32i8(<32 x i8>*, <32 x i1>, i32) + +define void @vp_store_v31i8(<31 x i8>* %ptr, <31 x i8> %val, <31 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_store_v31i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset ra, -8 +; CHECK-NEXT: .cfi_offset s0, -16 +; CHECK-NEXT: addi s0, sp, 96 +; CHECK-NEXT: .cfi_def_cfa s0, 0 +; CHECK-NEXT: andi sp, sp, -32 +; CHECK-NEXT: sb zero, 63(sp) +; CHECK-NEXT: addi a2, zero, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmerge.vim v26, v26, 1, v0 +; CHECK-NEXT: vmv.x.s a3, v26 +; CHECK-NEXT: sb a3, 32(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v28, v26, 30 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 62(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 29 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 61(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 28 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 60(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 27 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 59(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 26 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 58(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 25 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 57(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 24 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 56(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 23 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 55(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 22 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 54(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 21 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 53(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 20 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 52(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 19 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 51(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 18 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 50(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 17 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 49(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 16 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 48(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 15 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 47(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 14 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 46(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 13 +; CHECK-NEXT: vmv.x.s a3, v28 +; 
CHECK-NEXT: sb a3, 45(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 12 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 44(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 11 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 43(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 10 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 42(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 9 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 41(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 8 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 40(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 7 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 39(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 6 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 38(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 5 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 37(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 4 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 36(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 3 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 35(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 2 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 34(sp) +; CHECK-NEXT: vslidedown.vi v26, v26, 1 +; CHECK-NEXT: vmv.x.s a3, v26 +; CHECK-NEXT: sb a3, 33(sp) +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: addi a2, sp, 32 +; CHECK-NEXT: vle8.v v26, (a2) +; CHECK-NEXT: vand.vi v26, v26, 1 +; CHECK-NEXT: vmsne.vi v0, v26, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, s0, -96 +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 +; CHECK-NEXT: ret + call void @llvm.vp.store.v31i8(<31 x i8> %val, <31 x i8>* %ptr, <31 x i1> %m, i32 %evl) + ret void +} +declare void @llvm.vp.store.v31i8(<31 x i8>, <31 x i8>*, <31 x i1>, i32) +define <31 x i8> @vp_load_v31i8_i32(<31 x i8>* %ptr, <31 x i1> %m, i32 %evl) { +; CHECK-LABEL: vp_load_v31i8_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset ra, -8 +; CHECK-NEXT: .cfi_offset s0, -16 +; CHECK-NEXT: addi s0, sp, 96 +; CHECK-NEXT: .cfi_def_cfa s0, 0 +; CHECK-NEXT: andi sp, sp, -32 +; CHECK-NEXT: sb zero, 63(sp) +; CHECK-NEXT: addi a2, zero, 32 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmerge.vim v26, v26, 1, v0 +; CHECK-NEXT: vmv.x.s a3, v26 +; CHECK-NEXT: sb a3, 32(sp) +; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v28, v26, 30 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 62(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 29 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 61(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 28 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 60(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 27 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 59(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 26 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 58(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 25 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 57(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 24 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 56(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 23 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 55(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 22 
+; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 54(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 21 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 53(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 20 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 52(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 19 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 51(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 18 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 50(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 17 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 49(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 16 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 48(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 15 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 47(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 14 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 46(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 13 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 45(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 12 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 44(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 11 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 43(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 10 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 42(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 9 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 41(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 8 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 40(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 7 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 39(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 6 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 38(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 5 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 37(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 4 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 36(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 3 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 35(sp) +; CHECK-NEXT: vslidedown.vi v28, v26, 2 +; CHECK-NEXT: vmv.x.s a3, v28 +; CHECK-NEXT: sb a3, 34(sp) +; CHECK-NEXT: vslidedown.vi v26, v26, 1 +; CHECK-NEXT: vmv.x.s a3, v26 +; CHECK-NEXT: sb a3, 33(sp) +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, mu +; CHECK-NEXT: addi a2, sp, 32 +; CHECK-NEXT: vle8.v v26, (a2) +; CHECK-NEXT: vand.vi v26, v26, 1 +; CHECK-NEXT: vmsne.vi v0, v26, 0 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: addi sp, s0, -96 +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 +; CHECK-NEXT: ret + %res = call <31 x i8> @llvm.vp.load.v31i8(<31 x i8>* %ptr, <31 x i1> %m, i32 %evl) + ret <31 x i8> %res +} +declare <31 x i8> @llvm.vp.load.v31i8(<31 x i8>*, <31 x i1>, i32) +