Teach SelectionDAG vector-type legalization to split VP_GATHER and VP_SCATTER.

Summary of the source changes below:
  * SplitMask gains an overload that takes the SDLoc to use for the split.
  * SplitVecRes_MGATHER becomes SplitVecRes_Gather and takes a MemSDNode so
    that it handles both MaskedGatherSDNode and VPGatherSDNode; for the VP
    form the explicit vector length is split with DAG.SplitEVL.
  * SplitVecOp_MGATHER becomes SplitVecOp_Gather and now forwards to
    SplitVecRes_Gather, then concatenates the two halves.
  * SplitVecOp_MSCATTER becomes SplitVecOp_Scatter and handles both
    MaskedScatterSDNode and VPScatterSDNode; the Hi half stays chained after
    the Lo half.

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -821,6 +821,9 @@
   /// Split mask operator of a VP intrinsic.
   std::pair<SDValue, SDValue> SplitMask(SDValue Mask);
 
+  /// Split mask operator of a VP intrinsic in a given location.
+  std::pair<SDValue, SDValue> SplitMask(SDValue Mask, const SDLoc &DL);
+
   // Helper function for incrementing the pointer when splitting
   // memory operations
   void IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI,
@@ -851,7 +854,7 @@
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
-  void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -879,8 +882,8 @@
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
-  SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
-  SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo);
+  SDValue SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_Gather(MemSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
   SDValue SplitVecOp_FP_ROUND(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -944,7 +944,8 @@
     SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
     break;
   case ISD::MGATHER:
-    SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
+  case ISD::VP_GATHER:
+    SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi);
     break;
   case ISD::SETCC:
     SplitVecRes_SETCC(N, Lo, Hi);
@@ -1118,12 +1119,17 @@
 }
 
 std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask) {
+  return SplitMask(Mask, SDLoc(Mask));
+}
+
+std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask,
+                                                        const SDLoc &DL) {
   SDValue MaskLo, MaskHi;
   EVT MaskVT = Mask.getValueType();
   if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
     GetSplitVector(Mask, MaskLo, MaskHi);
   else
-    std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask));
+    std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
   return std::make_pair(MaskLo, MaskHi);
 }
 
@@ -1923,61 +1929,93 @@
 
 }
 
-void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
-                                           SDValue &Lo, SDValue &Hi) {
+void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
   EVT LoVT, HiVT;
-  SDLoc dl(MGT);
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
-
-  SDValue Ch = MGT->getChain();
-  SDValue Ptr = MGT->getBasePtr();
-  SDValue Mask = MGT->getMask();
-  SDValue PassThru = MGT->getPassThru();
-  SDValue Index = MGT->getIndex();
-  SDValue Scale = MGT->getScale();
-  EVT MemoryVT = MGT->getMemoryVT();
-  Align Alignment = MGT->getOriginalAlign();
-  ISD::LoadExtType ExtType = MGT->getExtensionType();
+  SDLoc dl(N);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  SDValue Ch = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  struct Operands {
+    SDValue Mask;
+    SDValue Index;
+    SDValue Scale;
+  } Ops = [&]() -> Operands {
+    if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
+      return {MGT->getMask(), MGT->getIndex(), MGT->getScale()};
+    }
+    if (auto *VPGT = dyn_cast<VPGatherSDNode>(N)) {
+      return {VPGT->getMask(), VPGT->getIndex(), VPGT->getScale()};
+    }
+    llvm_unreachable(
+        "Only know how to split masked.gather and vp.gather nodes");
+  }();
+  EVT MemoryVT = N->getMemoryVT();
+  Align Alignment = N->getOriginalAlign();
 
   // Split Mask operand
   SDValue MaskLo, MaskHi;
-  if (Mask.getOpcode() == ISD::SETCC) {
-    SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+  if (Ops.Mask.getOpcode() == ISD::SETCC) {
+    SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
   } else {
-    if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
-      GetSplitVector(Mask, MaskLo, MaskHi);
+    if (getTypeAction(Ops.Mask.getValueType()) ==
+        TargetLowering::TypeSplitVector)
+      GetSplitVector(Ops.Mask, MaskLo, MaskHi);
     else
-      std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+      std::tie(MaskLo, MaskHi) = DAG.SplitVector(Ops.Mask, dl);
   }
 
   EVT LoMemVT, HiMemVT;
   // Split MemoryVT
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
-  SDValue PassThruLo, PassThruHi;
-  if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
-    GetSplitVector(PassThru, PassThruLo, PassThruHi);
-  else
-    std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
-
   SDValue IndexHi, IndexLo;
-  if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
-    GetSplitVector(Index, IndexLo, IndexHi);
+  if (getTypeAction(Ops.Index.getValueType()) ==
+      TargetLowering::TypeSplitVector)
+    GetSplitVector(Ops.Index, IndexLo, IndexHi);
   else
-    std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+    std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl);
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MGT->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
-      MGT->getRanges());
-
-  SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
-                           MMO, MGT->getIndexType(), ExtType);
+      N->getPointerInfo(), MachineMemOperand::MOLoad,
+      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
 
-  SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
-                           MMO, MGT->getIndexType(), ExtType);
+  if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
+    SDValue PassThru = MGT->getPassThru();
+    SDValue PassThruLo, PassThruHi;
+    if (getTypeAction(PassThru.getValueType()) ==
+        TargetLowering::TypeSplitVector)
+      GetSplitVector(PassThru, PassThruLo, PassThruHi);
+    else
+      std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
+
+    ISD::LoadExtType ExtType = MGT->getExtensionType();
+    ISD::MemIndexType IndexTy = MGT->getIndexType();
+
+    SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+    Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
+                             OpsLo, MMO, IndexTy, ExtType);
+
+    SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+    Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
+                             OpsHi, MMO, IndexTy, ExtType);
+  } else if (auto *VPGT = dyn_cast<VPGatherSDNode>(N)) {
+    SDValue EVLLo, EVLHi;
+    std::tie(EVLLo, EVLHi) =
+        DAG.SplitEVL(VPGT->getVectorLength(), MemoryVT, dl);
+
+    SDValue OpsLo[] = {Ch, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+    Lo = DAG.getGatherVP(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
+                         MMO, VPGT->getIndexType());
+
+    SDValue OpsHi[] = {Ch, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+    Hi = DAG.getGatherVP(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
+                         MMO, VPGT->getIndexType());
+  } else {
+    llvm_unreachable(
+        "Only know how to split masked.gather and vp.gather nodes");
+  }
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -1986,10 +2024,9 @@
 
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
-  ReplaceValueWith(SDValue(MGT, 1), Ch);
+  ReplaceValueWith(SDValue(N, 1), Ch);
 }
 
-
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
@@ -2286,10 +2323,12 @@
     Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
     break;
   case ISD::MSCATTER:
-    Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
+  case ISD::VP_SCATTER:
+    Res = SplitVecOp_Scatter(cast<MemSDNode>(N), OpNo);
     break;
   case ISD::MGATHER:
-    Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
+  case ISD::VP_GATHER:
+    Res = SplitVecOp_Gather(cast<MemSDNode>(N), OpNo);
     break;
   case ISD::VSELECT:
     Res = SplitVecOp_VSELECT(N, OpNo);
@@ -2663,69 +2702,13 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
 }
 
-SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
-                                             unsigned OpNo) {
-  EVT LoVT, HiVT;
-  SDLoc dl(MGT);
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
-
-  SDValue Ch = MGT->getChain();
-  SDValue Ptr = MGT->getBasePtr();
-  SDValue Index = MGT->getIndex();
-  SDValue Scale = MGT->getScale();
-  SDValue Mask = MGT->getMask();
-  SDValue PassThru = MGT->getPassThru();
-  Align Alignment = MGT->getOriginalAlign();
-  ISD::LoadExtType ExtType = MGT->getExtensionType();
-
-  SDValue MaskLo, MaskHi;
-  if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
-    // Split Mask operand
-    GetSplitVector(Mask, MaskLo, MaskHi);
-  else
-    std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
-
-  EVT MemoryVT = MGT->getMemoryVT();
-  EVT LoMemVT, HiMemVT;
-  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
-  SDValue PassThruLo, PassThruHi;
-  if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
-    GetSplitVector(PassThru, PassThruLo, PassThruHi);
-  else
-    std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
-
-  SDValue IndexHi, IndexLo;
-  if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
-    GetSplitVector(Index, IndexLo, IndexHi);
-  else
-    std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
-
-  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MGT->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
-      MGT->getRanges());
-
-  SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
-                                   OpsLo, MMO, MGT->getIndexType(), ExtType);
-
-  SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
-                                   OpsHi, MMO, MGT->getIndexType(), ExtType);
-
-  // Build a factor node to remember that this load is independent of the
-  // other one.
-  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
-                   Hi.getValue(1));
-
-  // Legalize the chain result - switch anything that used the old chain to
-  // use the new one.
-  ReplaceValueWith(SDValue(MGT, 1), Ch);
+SDValue DAGTypeLegalizer::SplitVecOp_Gather(MemSDNode *N, unsigned OpNo) {
+  (void)OpNo;
+  SDValue Lo, Hi;
+  SplitVecRes_Gather(N, Lo, Hi);
 
-  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
-                            Hi);
-  ReplaceValueWith(SDValue(MGT, 0), Res);
+  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, N, N->getValueType(0), Lo, Hi);
+  ReplaceValueWith(SDValue(N, 0), Res);
   return SDValue();
 }
 
@@ -2886,17 +2869,29 @@
   return Res;
 }
 
-SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
-                                              unsigned OpNo) {
-  SDValue Ch  = N->getChain();
+SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
+  SDValue Ch = N->getChain();
   SDValue Ptr = N->getBasePtr();
-  SDValue Mask = N->getMask();
-  SDValue Index = N->getIndex();
-  SDValue Scale = N->getScale();
-  SDValue Data = N->getValue();
   EVT MemoryVT = N->getMemoryVT();
   Align Alignment = N->getOriginalAlign();
   SDLoc DL(N);
+  struct Operands {
+    SDValue Mask;
+    SDValue Index;
+    SDValue Scale;
+    SDValue Data;
+  } Ops = [&]() -> Operands {
+    if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+      return {MSC->getMask(), MSC->getIndex(), MSC->getScale(),
+              MSC->getValue()};
+    }
+    if (auto *VPSC = dyn_cast<VPScatterSDNode>(N)) {
+      return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale(),
+              VPSC->getValue()};
+    }
+    llvm_unreachable(
+        "Only know how to split masked.scatter and vp.scatter nodes");
+  }();
 
   // Split all operands
 
@@ -2904,46 +2899,64 @@
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   SDValue DataLo, DataHi;
-  if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+  if (getTypeAction(Ops.Data.getValueType()) == TargetLowering::TypeSplitVector)
     // Split Data operand
-    GetSplitVector(Data, DataLo, DataHi);
+    GetSplitVector(Ops.Data, DataLo, DataHi);
   else
-    std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+    std::tie(DataLo, DataHi) = DAG.SplitVector(Ops.Data, DL);
 
   // Split Mask operand
   SDValue MaskLo, MaskHi;
-  if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
-    SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+  if (OpNo == 1 && Ops.Mask.getOpcode() == ISD::SETCC) {
+    SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
   } else {
-    if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
-      GetSplitVector(Mask, MaskLo, MaskHi);
-    else
-      std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+    std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, DL);
   }
 
   SDValue IndexHi, IndexLo;
-  if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
-    GetSplitVector(Index, IndexLo, IndexHi);
+  if (getTypeAction(Ops.Index.getValueType()) ==
+      TargetLowering::TypeSplitVector)
+    GetSplitVector(Ops.Index, IndexLo, IndexHi);
   else
-    std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+    std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL);
 
   SDValue Lo;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
       MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
 
-  SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
-  Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT,
-                            DL, OpsLo, MMO, N->getIndexType(),
-                            N->isTruncatingStore());
-
-  // The order of the Scatter operation after split is well defined. The "Hi"
-  // part comes after the "Lo". So these two operations should be chained one
-  // after another.
-  SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
-  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT,
-                              DL, OpsHi, MMO, N->getIndexType(),
-                              N->isTruncatingStore());
+  if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+    SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+    Lo =
+        DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+                             MSC->getIndexType(), MSC->isTruncatingStore());
+
+    // The order of the Scatter operation after split is well defined. The "Hi"
+    // part comes after the "Lo". So these two operations should be chained one
+    // after another.
+    SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+    return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi,
+                                MMO, MSC->getIndexType(),
+                                MSC->isTruncatingStore());
+  }
+  if (auto *VPSC = dyn_cast<VPScatterSDNode>(N)) {
+    SDValue EVLLo, EVLHi;
+    std::tie(EVLLo, EVLHi) =
+        DAG.SplitEVL(VPSC->getVectorLength(), Ops.Data.getValueType(), DL);
+
+    SDValue OpsLo[] = {Ch, DataLo, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+    Lo = DAG.getScatterVP(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+                          VPSC->getIndexType());
+
+    // The order of the Scatter operation after split is well defined. The "Hi"
+    // part comes after the "Lo". So these two operations should be chained one
+    // after another.
+    SDValue OpsHi[] = {Lo, DataHi, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+    return DAG.getScatterVP(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, MMO,
+                            VPSC->getIndexType());
+  }
+  llvm_unreachable(
+      "Only know how to split masked.scatter and vp.scatter nodes");
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -271,6 +271,54 @@ ret <8 x i8> %v } +declare <32 x i8> @llvm.vp.gather.v32i8.v32p0i8(<32 x i8*>, <32 x i1>, i32) + +define <32 x i8> @vpgather_baseidx_v32i8(i8* %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_v32i8: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_v32i8: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: bltu a1, a3, .LBB13_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB13_2: +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v10, 2 +; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu +; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB13_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB13_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vluxei64.v v8,
(a0), v16, v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, mu +; RV64-NEXT: vslideup.vi v8, v12, 16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs + %v = call <32 x i8> @llvm.vp.gather.v32i8.v32p0i8(<32 x i8*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x i8> %v +} + declare <2 x i16> @llvm.vp.gather.v2i16.v2p0i16(<2 x i16*>, <2 x i1>, i32) define <2 x i16> @vpgather_v2i16(<2 x i16*> %ptrs, <2 x i1> %m, i32 zeroext %evl) { @@ -1870,3 +1918,756 @@ %v = call <8 x double> @llvm.vp.gather.v8f64.v8p0f64(<8 x double*> %ptrs, <8 x i1> %m, i32 %evl) ret <8 x double> %v } + +declare <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*>, <32 x i1>, i32) + +define <32 x double> @vpgather_v32f64(<32 x double*> %ptrs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi a2, a0, -16 +; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: bltu a0, a2, .LBB86_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB86_2: +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v1, 2 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (zero), v24, v0.t +; RV32-NEXT: li a1, 16 +; RV32-NEXT: bltu a0, a1, .LBB86_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: .LBB86_4: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t +; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a2, a0, -16 +; RV64-NEXT: vmv1r.v v24, v0 +; RV64-NEXT: li a1, 0 +; RV64-NEXT: bltu a0, a2, .LBB86_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: .LBB86_2: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v24, 2 +; RV64-NEXT: vsetvli zero, a1, e64, m8, 
ta, mu +; RV64-NEXT: vluxei64.v v16, (zero), v16, v0.t +; RV64-NEXT: li a1, 16 +; RV64-NEXT: bltu a0, a1, .LBB86_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a0, 16 +; RV64-NEXT: .LBB86_4: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vluxei64.v v8, (zero), v8, v0.t +; RV64-NEXT: ret + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_v32i8_v32f64(double* %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_v32i8_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, 16 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB87_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB87_2: +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: bltu a1, a3, .LBB87_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB87_4: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_v32i8_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: bltu a1, a3, .LBB87_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB87_2: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v10, 2 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v12 +; 
RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB87_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB87_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, <32 x i8> %idxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(double* %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_sext_v32i8_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV32-NEXT: vslidedown.vi v12, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsext.vf8 v16, v12 +; RV32-NEXT: bltu a1, a3, .LBB88_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB88_2: +; RV32-NEXT: vsext.vf8 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v10, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v12, v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB88_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB88_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; 
RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vluxei32.v v8, (a0), v4, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_sext_v32i8_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: bltu a1, a3, .LBB88_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB88_2: +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v10, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB88_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB88_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = sext <32 x i8> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(double* %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_zext_v32i8_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV32-NEXT: vslidedown.vi v12, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vzext.vf8 v16, v12 +; RV32-NEXT: bltu a1, a3, .LBB89_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB89_2: +; RV32-NEXT: 
vzext.vf8 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v10, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v12, v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB89_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB89_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vluxei32.v v8, (a0), v4, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_zext_v32i8_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: bltu a1, a3, .LBB89_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB89_2: +; RV64-NEXT: vzext.vf8 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v10, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB89_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB89_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = zext <32 x i8> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> 
%eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_v32i16_v32f64(double* %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_v32i16_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, 16 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB90_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB90_2: +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vsext.vf2 v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: bltu a1, a3, .LBB90_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB90_4: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_v32i16_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: bltu a1, a3, .LBB90_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB90_2: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v12, 2 +; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB90_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB90_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: 
vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, <32 x i16> %idxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(double* %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_sext_v32i16_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsext.vf4 v16, v24 +; RV32-NEXT: bltu a1, a3, .LBB91_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB91_2: +; RV32-NEXT: vsext.vf4 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v12, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v8, v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB91_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB91_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v8, (a0), v4, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_sext_v32i16_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: 
vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf4 v16, v24 +; RV64-NEXT: bltu a1, a3, .LBB91_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB91_2: +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v12, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB91_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB91_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = sext <32 x i16> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(double* %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_zext_v32i16_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vzext.vf4 v16, v24 +; RV32-NEXT: bltu a1, a3, .LBB92_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB92_2: +; RV32-NEXT: vzext.vf4 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v12, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v8, v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu 
+; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB92_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB92_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v8, (a0), v4, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_zext_v32i16_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vzext.vf4 v16, v24 +; RV64-NEXT: bltu a1, a3, .LBB92_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB92_2: +; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v12, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB92_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB92_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = zext <32 x i16> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_v32i32_v32f64: +; RV32: # 
%bb.0: +; RV32-NEXT: li a3, 16 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB93_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB93_2: +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v8, 3 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: bltu a1, a3, .LBB93_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB93_4: +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: bltu a1, a3, .LBB93_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB93_2: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v1, 2 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB93_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB93_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, <32 x i32> %idxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> 
%m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_sext_v32i32_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vsext.vf2 v16, v24 +; RV32-NEXT: bltu a1, a3, .LBB94_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB94_2: +; RV32-NEXT: vsext.vf2 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v1, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v8, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB94_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB94_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vsext.vf2 v16, v24 +; RV64-NEXT: bltu a1, a3, .LBB94_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB94_2: +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v1, 2 +; RV64-NEXT: vsetivli zero, 16, e64, 
m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB94_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB94_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t +; RV64-NEXT: ret + %eidxs = sext <32 x i32> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_zext_v32i32_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vzext.vf2 v16, v24 +; RV32-NEXT: bltu a1, a3, .LBB95_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB95_2: +; RV32-NEXT: vzext.vf2 v24, v8 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v1, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v8, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v4, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB95_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB95_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vsetvli zero, a1, 
e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v1, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vzext.vf2 v16, v24 +; RV64-NEXT: bltu a1, a3, .LBB95_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB95_2: +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v1, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB95_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB95_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t +; RV64-NEXT: ret + %eidxs = zext <32 x i32> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vpgather_baseidx_v32f64(double* %base, <32 x i64> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi a3, a1, -16 +; RV32-NEXT: vmv1r.v v24, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: bltu a1, a3, .LBB96_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB96_2: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v24, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli a3, 
zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v28, v16, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB96_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB96_4: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v28, v8, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vluxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, a1, -16 +; RV64-NEXT: vmv1r.v v24, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: bltu a1, a3, .LBB96_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB96_2: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v24, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: li a2, 16 +; RV64-NEXT: bltu a1, a2, .LBB96_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB96_4: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %idxs + %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0f64(<32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-v -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-v,+m -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-v -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-v,+m -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare void @llvm.vp.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, <2 x i1>, i32) @@ -1716,3 +1716,470 @@ call void @llvm.vp.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, <8 x i1> %m, i32 %evl) ret void } + +declare void @llvm.vp.scatter.v32f64.v32p0f64(<32 x double>, <32 x double*>, <32 x i1>, i32) + +define void @vpscatter_v32f64(<32 x double> %val, <32 x double*> %ptrs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v24, (a0) +; RV32-NEXT: li a0, 16 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a0, .LBB79_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB79_2: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t +; RV32-NEXT: bltu a1, a2, .LBB79_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB79_4: +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v16, (zero), v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; 
RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vle64.v v24, (a0) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a3, 16 +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: bltu a2, a3, .LBB79_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB79_2: +; RV64-NEXT: li a3, 0 +; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: addi a0, a2, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t +; RV64-NEXT: bltu a2, a0, .LBB79_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a3, a0 +; RV64-NEXT: .LBB79_4: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + call void @llvm.vp.scatter.v32f64.v32p0f64(<32 x double> %val, <32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret void +} + +define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_v32i32_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: li a3, 16 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: bltu a2, a3, .LBB80_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB80_2: +; 
RV32-NEXT: li a3, 0 +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: bltu a2, a1, .LBB80_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB80_4: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 16 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB80_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a3, 16 +; RV64-NEXT: .LBB80_2: +; RV64-NEXT: li a1, 0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8re8.v v0, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsext.vf2 v24, v0 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a3, a2, -16 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: bltu a2, a3, .LBB80_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: .LBB80_4: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: 
vslidedown.vi v0, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, <32 x i32> %idxs + call void @llvm.vp.scatter.v32f64.v32p0f64(<32 x double> %val, <32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret void +} + +define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_sext_v32i32_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsext.vf2 v8, v24 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: bltu 
a2, a3, .LBB81_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB81_2: +; RV32-NEXT: li a3, 0 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsext.vf2 v16, v24 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli a4, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: bltu a2, a1, .LBB81_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB81_4: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_sext_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, 
vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsext.vf2 v8, v24 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: bltu a2, a3, .LBB81_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB81_2: +; RV64-NEXT: li a3, 0 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsext.vf2 v16, v24 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: addi a1, a2, -16 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 3 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: bltu a2, a1, .LBB81_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: .LBB81_4: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %eidxs = sext <32 x i32> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + call void @llvm.vp.scatter.v32f64.v32p0f64(<32 x double> %val, <32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret void +} 
+ +define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, double* %base, <32 x i32> %idxs, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_zext_v32i32_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vzext.vf2 v8, v24 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: bltu a2, a3, .LBB82_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB82_2: +; RV32-NEXT: li a3, 0 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vzext.vf2 v16, v24 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli a4, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: addi a1, a2, -16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl8re8.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: bltu a2, a1, .LBB82_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB82_4: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32-NEXT: 
vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_zext_v32i32_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, mu +; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: li a3, 16 +; RV64-NEXT: vzext.vf2 v8, v24 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: bltu a2, a3, .LBB82_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB82_2: +; RV64-NEXT: li a3, 0 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vzext.vf2 v16, v24 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: addi a1, a2, -16 
+; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 3 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8re8.v v24, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: bltu a2, a1, .LBB82_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: .LBB82_4: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %eidxs = zext <32 x i32> %idxs to <32 x i64> + %ptrs = getelementptr inbounds double, double* %base, <32 x i64> %eidxs + call void @llvm.vp.scatter.v32f64.v32p0f64(<32 x double> %val, <32 x double*> %ptrs, <32 x i1> %m, i32 %evl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -251,6 +251,107 @@ ret %v } +declare @llvm.vp.gather.nxv32i8.nxv32p0i8(, , i32) + +define @vpgather_baseidx_nxv32i8(i8* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_nxv32i8: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a5, a2, 2 +; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, mu +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: sub a4, a1, a2 +; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: bltu a1, a4, .LBB12_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: .LBB12_2: +; RV32-NEXT: vsetvli a4, 
zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v24, v10 +; RV32-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV32-NEXT: vluxei32.v v18, (a0), v24, v0.t +; RV32-NEXT: bltu a1, a2, .LBB12_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB12_4: +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf4 v24, v8 +; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: vmv4r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_nxv32i8: +; RV64: # %bb.0: +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli t0, a3, 1 +; RV64-NEXT: sub a4, a1, t0 +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li t1, 0 +; RV64-NEXT: li a7, 0 +; RV64-NEXT: bltu a1, a4, .LBB12_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a7, a4 +; RV64-NEXT: .LBB12_2: +; RV64-NEXT: sub a4, a7, a3 +; RV64-NEXT: mv a2, t1 +; RV64-NEXT: bltu a7, a4, .LBB12_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a2, a4 +; RV64-NEXT: .LBB12_4: +; RV64-NEXT: srli a4, a3, 2 +; RV64-NEXT: vsetvli a5, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v13, v12, a4 +; RV64-NEXT: srli a6, a3, 3 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v13, a6 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v11 +; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, mu +; RV64-NEXT: vluxei64.v v19, (a0), v24, v0.t +; RV64-NEXT: bltu a1, t0, .LBB12_6 +; RV64-NEXT: # %bb.5: +; RV64-NEXT: mv a1, t0 +; RV64-NEXT: .LBB12_6: +; RV64-NEXT: sub a2, a1, a3 +; RV64-NEXT: bltu a1, a2, .LBB12_8 +; RV64-NEXT: # %bb.7: +; RV64-NEXT: mv t1, a2 +; RV64-NEXT: .LBB12_8: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v12, a6 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v9 +; RV64-NEXT: vsetvli zero, t1, e8, m1, ta, mu +; RV64-NEXT: vluxei64.v v17, (a0), v24, v0.t +; RV64-NEXT: bltu a1, a3, .LBB12_10 +; RV64-NEXT: # %bb.9: +; RV64-NEXT: mv a1, a3 +; 
RV64-NEXT: .LBB12_10: +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t +; RV64-NEXT: bltu a7, a3, .LBB12_12 +; RV64-NEXT: # %bb.11: +; RV64-NEXT: mv a7, a3 +; RV64-NEXT: .LBB12_12: +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v24, v10 +; RV64-NEXT: vsetvli zero, a7, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v13 +; RV64-NEXT: vluxei64.v v18, (a0), v24, v0.t +; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, %idxs + %v = call @llvm.vp.gather.nxv32i8.nxv32p0i8( %ptrs, %m, i32 %evl) + ret %v +} + declare @llvm.vp.gather.nxv1i16.nxv1p0i16(, , i32) define @vpgather_nxv1i16( %ptrs, %m, i32 zeroext %evl) { @@ -2232,3 +2333,264 @@ %v = call @llvm.vp.gather.nxv8f64.nxv8p0f64( %ptrs, %m, i32 %evl) ret %v } + +declare @llvm.vp.gather.nxv16f64.nxv16p0f64(, , i32) + +define @vpgather_nxv16f64( %ptrs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v24, v0 +; RV32-NEXT: li a2, 0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a4, a1, 3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a3, a0, a1 +; RV32-NEXT: vslidedown.vx v0, v0, a4 +; RV32-NEXT: bltu a0, a3, .LBB102_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB102_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (zero), v12, v0.t +; RV32-NEXT: bltu a0, a1, .LBB102_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB102_4: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t +; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v24, v0 +; RV64-NEXT: li a2, 0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a4, a1, 3 +; RV64-NEXT: 
vsetvli a3, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a3, a0, a1 +; RV64-NEXT: vslidedown.vx v0, v0, a4 +; RV64-NEXT: bltu a0, a3, .LBB102_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB102_2: +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (zero), v16, v0.t +; RV64-NEXT: bltu a0, a1, .LBB102_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB102_4: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vluxei64.v v8, (zero), v8, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.gather.nxv16f64.nxv16p0f64( %ptrs, %m, i32 %evl) + ret %v +} + +define @vpgather_baseidx_nxv16i16_nxv16f64(double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a5, a2, 3 +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a4, a1, a2 +; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: bltu a1, a4, .LBB103_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: .LBB103_2: +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf2 v16, v8 +; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: bltu a1, a2, .LBB103_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB103_4: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a4, a1, a2 +; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: bltu a1, a4, .LBB103_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: 
.LBB103_2: +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: bltu a1, a2, .LBB103_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: .LBB103_4: +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v = call @llvm.vp.gather.nxv16f64.nxv16p0f64( %ptrs, %m, i32 %evl) + ret %v +} + +define @vpgather_baseidx_sext_nxv16i16_nxv16f64(double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vsext.vf4 v16, v10 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a5, a2, 3 +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a4, a1, a2 +; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: bltu a1, a4, .LBB104_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: .LBB104_2: +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV32-NEXT: vsext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v8, v16, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: bltu a1, a2, .LBB104_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB104_4: +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v8, (a0), v4, 
v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a4, a1, a2 +; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: bltu a1, a4, .LBB104_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: .LBB104_2: +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: bltu a1, a2, .LBB104_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: .LBB104_4: +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.vp.gather.nxv16f64.nxv16p0f64( %ptrs, %m, i32 %evl) + ret %v +} + +define @vpgather_baseidx_zext_nxv16i16_nxv16f64(double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vzext.vf4 v16, v10 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a5, a2, 3 +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a4, a1, a2 +; RV32-NEXT: vslidedown.vx v0, v0, a5 +; RV32-NEXT: bltu a1, a4, .LBB105_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: .LBB105_2: +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV32-NEXT: vzext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: 
vnsrl.wi v8, v16, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: bltu a1, a2, .LBB105_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB105_4: +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v4, v24, 0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vluxei32.v v8, (a0), v4, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vzext.vf4 v16, v10 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a5, a2, 3 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a4, a1, a2 +; RV64-NEXT: vslidedown.vx v0, v0, a5 +; RV64-NEXT: bltu a1, a4, .LBB105_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: .LBB105_2: +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, mu +; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: bltu a1, a2, .LBB105_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: .LBB105_4: +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.vp.gather.nxv16f64.nxv16p0f64( %ptrs, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare void @llvm.vp.scatter.nxv1i8.nxv1p0i8(, , , i32) @@ -2071,3 +2071,367 @@ call void @llvm.vp.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, %m, i32 %evl) ret void } + +declare void @llvm.vp.scatter.nxv16f64.nxv16p0f64(, , , i32) + +define void @vpscatter_nxv16f64( %val, %ptrs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vl8re32.v v24, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a0, .LBB95_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: .LBB95_2: +; RV32-NEXT: li a3, 0 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t +; RV32-NEXT: srli a2, a0, 3 +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: vslidedown.vx v0, v0, a2 +; RV32-NEXT: bltu a1, a0, .LBB95_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB95_4: +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v16, (zero), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8re64.v v16, (a0) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a3, a1, 3 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB95_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv 
a3, a1 +; RV64-NEXT: .LBB95_2: +; RV64-NEXT: li a4, 0 +; RV64-NEXT: vl8re64.v v24, (a0) +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a0, a2, a1 +; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: bltu a2, a0, .LBB95_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a4, a0 +; RV64-NEXT: .LBB95_4: +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + call void @llvm.vp.scatter.nxv16f64.nxv16p0f64( %val, %ptrs, %m, i32 %evl) + ret void +} + +define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vl4re16.v v4, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB96_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB96_2: +; RV32-NEXT: li a4, 0 +; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, mu +; RV32-NEXT: vsext.vf2 v24, v4 +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: bltu a2, a1, .LBB96_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a4, a1 +; RV32-NEXT: .LBB96_4: +; RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vl4re16.v v4, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, 
.LBB96_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: .LBB96_2: +; RV64-NEXT: li a4, 0 +; RV64-NEXT: vsetvli a5, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v24, v4 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: bltu a2, a1, .LBB96_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: .LBB96_4: +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf4 v8, v6 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + call void @llvm.vp.scatter.nxv16f64.nxv16p0f64( %val, %ptrs, %m, i32 %evl) + ret void +} + +define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %val, double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: vl4re16.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: vsext.vf4 v8, v24 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB97_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB97_2: +; RV32-NEXT: li a4, 0 +; RV32-NEXT: vsext.vf4 v16, v26 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v24, v8, 0 +; 
RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: bltu a2, a1, .LBB97_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a4, a1 +; RV32-NEXT: .LBB97_4: +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: vl4re16.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vsext.vf4 v8, v24 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB97_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: .LBB97_2: +; RV64-NEXT: li a4, 0 +; RV64-NEXT: vsext.vf4 v16, v26 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a3) # Unknown-size Folded 
Reload +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: bltu a2, a1, .LBB97_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: .LBB97_4: +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.vp.scatter.nxv16f64.nxv16p0f64( %val, %ptrs, %m, i32 %evl) + ret void +} + +define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %val, double* %base, %idxs, %m, i32 zeroext %evl) { +; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: vl4re16.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: vzext.vf4 v8, v24 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: bltu a2, a1, .LBB98_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: .LBB98_2: +; RV32-NEXT: li a4, 0 +; RV32-NEXT: vzext.vf4 v16, v26 +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; 
RV32-NEXT: vnsrl.wi v24, v8, 0 +; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: srli a3, a1, 3 +; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: vslidedown.vx v0, v0, a3 +; RV32-NEXT: bltu a2, a1, .LBB98_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a4, a1 +; RV32-NEXT: .LBB98_4: +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV32-NEXT: vsll.vi v8, v16, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vnsrl.wi v16, v8, 0 +; RV32-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: vl4re16.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vzext.vf4 v8, v24 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: bltu a2, a1, .LBB98_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: .LBB98_2: +; RV64-NEXT: li a4, 0 +; RV64-NEXT: vzext.vf4 v16, v26 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, mu +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8re8.v 
v24, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t +; RV64-NEXT: srli a3, a1, 3 +; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, mu +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: vslidedown.vx v0, v0, a3 +; RV64-NEXT: bltu a2, a1, .LBB98_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: .LBB98_4: +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.vp.scatter.nxv16f64.nxv16p0f64( %val, %ptrs, %m, i32 %evl) + ret void +}