diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1746,6 +1746,7 @@ SDValue PassThru = MGT->getPassThru(); SDValue Index = MGT->getIndex(); SDValue Scale = MGT->getScale(); + EVT MemoryVT = MGT->getMemoryVT(); Align Alignment = MGT->getOriginalAlign(); // Split Mask operand @@ -1759,6 +1760,10 @@ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); } + EVT LoMemVT, HiMemVT; + // Split MemoryVT + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + SDValue PassThruLo, PassThruHi; if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(PassThru, PassThruLo, PassThruHi); @@ -1777,11 +1782,11 @@ MGT->getRanges()); SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; - Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, + Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo, MMO, MGT->getIndexType()); SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; - Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi, + Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi, MMO, MGT->getIndexType()); // Build a factor node to remember that this load is independent of the @@ -2421,11 +2426,11 @@ MGT->getRanges()); SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; - SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, + SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo, MMO, MGT->getIndexType()); SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; - SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, + SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi, MMO, MGT->getIndexType()); // 
Build a factor node to remember that this load is independent of the diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7310,17 +7310,22 @@ return SDValue(E, 0); } + IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, VT, MMO, IndexType); createOperands(N, Ops); assert(N->getPassThru().getValueType() == N->getValueType(0) && "Incompatible type of the PassThru value in MaskedGatherSDNode"); - assert(N->getMask().getValueType().getVectorNumElements() == - N->getValueType(0).getVectorNumElements() && + assert(N->getMask().getValueType().getVectorElementCount() == + N->getValueType(0).getVectorElementCount() && "Vector width mismatch between mask and data"); - assert(N->getIndex().getValueType().getVectorNumElements() >= - N->getValueType(0).getVectorNumElements() && + assert(N->getIndex().getValueType().getVectorElementCount().isScalable() == + N->getValueType(0).getVectorElementCount().isScalable() && + "Scalable flags of index and data do not match"); + assert(ElementCount::isKnownGE( + N->getIndex().getValueType().getVectorElementCount(), + N->getValueType(0).getVectorElementCount()) && "Vector width mismatch between index and data"); assert(isa(N->getScale()) && cast(N->getScale())->getAPIntValue().isPowerOf2() && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4416,7 +4416,7 @@ if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; + IndexType = ISD::SIGNED_UNSCALED; Scale = DAG.getTargetConstant(1, sdl, 
TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -805,6 +805,7 @@ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -113,6 +113,16 @@ "optimization"), cl::init(true)); +// Temporary option added for the purpose of testing functionality added +// to DAGCombiner.cpp in D92230. It is expected that this can be removed +// in future when both implementations will be based off MGATHER rather +// than the GLD1 nodes added for the SVE gather load intrinsics. +static cl::opt +EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, + cl::desc("Combine extends of AArch64 masked " + "gather intrinsics"), + cl::init(true)); + /// Value type used for condition codes. 
static const MVT MVT_CC = MVT::i32; @@ -1059,6 +1069,7 @@ setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); @@ -1111,6 +1122,7 @@ MVT::nxv4f32, MVT::nxv2f64}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); @@ -3775,6 +3787,29 @@ return ExtVal.getValueType().isScalableVector(); } +unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { + std::map, unsigned> AddrModes = { + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::GLD1_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::GLD1_UXTW_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::GLD1_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::GLD1_SXTW_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::GLD1_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::GLD1_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, + }; + auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); + return AddrModes.find(Key)->second; +} + unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool 
NeedsExtend) { std::map, unsigned> AddrModes = { {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), @@ -3798,7 +3833,7 @@ return AddrModes.find(Key)->second; } -bool getScatterIndexIsExtended(SDValue Index) { +bool getGatherScatterIndexIsExtended(SDValue Index) { unsigned Opcode = Index.getOpcode(); if (Opcode == ISD::SIGN_EXTEND_INREG) return true; @@ -3816,6 +3851,54 @@ return false; } +SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MaskedGatherSDNode *MGT = cast(Op); + assert(MGT && "Can only custom lower gather load nodes"); + + SDValue Index = MGT->getIndex(); + SDValue Chain = MGT->getChain(); + SDValue PassThru = MGT->getPassThru(); + SDValue Mask = MGT->getMask(); + SDValue BasePtr = MGT->getBasePtr(); + + ISD::MemIndexType IndexType = MGT->getIndexType(); + bool IsScaled = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; + bool IsSigned = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; + bool IdxNeedsExtend = + getGatherScatterIndexIsExtended(Index) || + Index.getSimpleValueType().getVectorElementType() == MVT::i32; + + EVT VT = PassThru.getSimpleValueType(); + EVT MemVT = MGT->getMemoryVT(); + SDValue InputVT = DAG.getValueType(MemVT); + + if (VT.getVectorElementType() == MVT::bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + // Handle FP data + if (VT.isFloatingPoint()) { + VT = VT.changeVectorElementTypeToInteger(); + ElementCount EC = VT.getVectorElementCount(); + auto ScalarIntVT = + MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); + PassThru = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, + MVT::getVectorVT(ScalarIntVT, EC), PassThru); + + InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); + } + + SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other); + + SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru}; + return 
DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL, + VTs, Ops); +} + SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -3834,7 +3917,7 @@ bool IsSigned = IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; bool NeedsExtend = - getScatterIndexIsExtended(Index) || + getGatherScatterIndexIsExtended(Index) || Index.getSimpleValueType().getVectorElementType() == MVT::i32; EVT VT = StoreVal.getSimpleValueType(); @@ -3858,7 +3941,7 @@ InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); } - if (getScatterIndexIsExtended(Index)) + if (getGatherScatterIndexIsExtended(Index)) Index = Index.getOperand(0); SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; @@ -4159,6 +4242,8 @@ return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::MGATHER: + return LowerMGATHER(Op, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, DAG); case ISD::VECREDUCE_SEQ_FADD: @@ -12019,6 +12104,9 @@ return DAG.getNode(Opc, DL, N->getValueType(0), And); } + if (!EnableCombineMGatherIntrinsics) + return SDValue(); + SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) @@ -14982,6 +15070,9 @@ return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); } + if (!EnableCombineMGatherIntrinsics) + return SDValue(); + // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. 
unsigned NewOpc; diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %ptrs = getelementptr i64, i64* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, 
undef) + ret %vals +} + +define @masked_gather_nxv2f16(half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ret + %ptrs = getelementptr half, half* %base, %offsets + %vals = call @llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ret + %ptrs = getelementptr float, float* %base, %offsets + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(double* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %ptrs = getelementptr double, double* %base, %offsets + %vals = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: sxth z0.d, p1/m, z0.d +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; 
CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv4i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + %vals = call @llvm.masked.gather.nxv4i32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f16(half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; CHECK-NEXT: ret + %ptrs = getelementptr half, half* %base, %offsets + %vals = call @llvm.masked.gather.nxv4f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f32(float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %ptrs = getelementptr float, float* %base, %offsets + %vals = call @llvm.masked.gather.nxv4f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; 
CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) + +declare @llvm.masked.gather.nxv4i16(, i32, , ) +declare @llvm.masked.gather.nxv4i32(, i32, , ) +declare @llvm.masked.gather.nxv4f16(, i32, , ) +declare @llvm.masked.gather.nxv4f32(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll @@ -0,0 +1,328 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue 
p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call 
@llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: sxtb z0.d, p1/m, z0.d +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: sxth z0.d, p1/m, z0.d +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + 
%ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv4i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpkhi z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z2.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + 
%ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpkhi z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z2.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpkhi z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z2.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpkhi z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z2.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: zip2 
p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: sunpklo z2.d, z0.s +; CHECK-NEXT: sunpkhi z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z2.d, z1.d, z2.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i8(, i32, , ) +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) + +declare 
@llvm.masked.gather.nxv4i8(, i32, , ) +declare @llvm.masked.gather.nxv4i16(, i32, , ) +declare @llvm.masked.gather.nxv4i32(, i32, , ) +declare @llvm.masked.gather.nxv4f16(, i32, , ) +declare @llvm.masked.gather.nxv4f32(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i16, i16* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i32, i32* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // 
%bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i64, i64* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f16(half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr half, half* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr float, float* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(double* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr double, double* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i16, i16* %base, %offsets.zext + %vals = call 
@llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i32, i32* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv4i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i16, i16* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + 
%offsets.zext = zext %offsets to + %ptrs = getelementptr i32, i32* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4i32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f16(half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr half, half* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f32(float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr float, float* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %offsets.zext 
= zext %offsets to + %ptrs = getelementptr i16, i16* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) + +declare @llvm.masked.gather.nxv4i16(, i32, , ) +declare @llvm.masked.gather.nxv4i32(, i32, , ) +declare @llvm.masked.gather.nxv4f16(, i32, , ) +declare @llvm.masked.gather.nxv4f32(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i8, i8* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and 
z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + 
%ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i8, i8* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] 
+; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define @masked_gather_nxv4i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i8, i8* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov 
z1.d, x0 +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z1.d, z2.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z1.d, z2.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z1.d, z2.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, 
z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z1.d, z2.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %ptrs = getelementptr i8, i8* %base, %offsets.zext + %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s 
+; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z1.d, z1.d, z2.d +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %offsets.zext = zext %offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i8(, i32, , ) +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) + +declare @llvm.masked.gather.nxv4i8(, i32, , ) +declare @llvm.masked.gather.nxv4i16(, i32, , ) +declare @llvm.masked.gather.nxv4i32(, i32, , ) +declare @llvm.masked.gather.nxv4f16(, i32, , ) +declare @llvm.masked.gather.nxv4f32(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i16( 
%ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr i64, i64* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f16(half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: ret + %ptrs = getelementptr half, half* %base, %offsets + %vals = call @llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: ret + %ptrs = getelementptr float, float* %base, %offsets + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(double* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr double, double* %base, %offsets + %vals.sext = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals.sext +} + +define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { 
z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s + +define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; 
CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.zext = zext %vals to + ret %vals.zext +} + +define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f16( %ptrs, i32 2, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add 
z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f32( %ptrs, i32 4, %mask, undef) + ret %vals +} + +define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_gather_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2f64( %ptrs, i32 8, %mask, undef) + ret %vals +} + +define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_sgather_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d 
+; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + %vals.sext = sext %vals to + ret %vals.sext +} + +declare @llvm.masked.gather.nxv2i8(, i32, , ) +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) +declare @llvm.masked.gather.nxv2i64(, i32, , ) +declare @llvm.masked.gather.nxv2f16(, i32, , ) +declare @llvm.masked.gather.nxv2f32(, i32, , ) +declare @llvm.masked.gather.nxv2f64(, i32, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +; Tests that exercise various type legalisation scenarios for ISD::MGATHER. + +; Code generate load of an illegal datatype via promotion. +define @masked_gather_nxv2i32( %ptrs, %mask) { +; CHECK-LABEL: masked_gather_nxv2i32: +; CHECK-DAG: mov x8, xzr +; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK: ret + %data = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) + ret %data +} + +; Code generate the worst case scenario when all vector types are illegal. 
+define @masked_gather_nxv32i32(i32* %base, %indices, %mask) { +; CHECK-LABEL: masked_gather_nxv32i32: +; CHECK-NOT: unpkhi +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z0.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z1.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z2.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z3.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z4.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z5.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z6.s, sxtw #2] +; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z7.s, sxtw #2] +; CHECK: ret + %ptrs = getelementptr i32, i32* %base, %indices + %data = call @llvm.masked.gather.nxv32i32( %ptrs, i32 4, %mask, undef) + ret %data +} + +; TODO: Currently, the sign extend gets applied to the values after a 'uzp1' of two +; registers, so it doesn't get folded away. Same for any other vector-of-pointers +; style gathers which don't fit in a single register. Better folding +; is required before we can check those off. +define @masked_sgather_nxv4i8( %ptrs, %mask) { +; CHECK-LABEL: masked_sgather_nxv4i8: +; CHECK: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: zip2 p2.s, p0.s, p1.s +; CHECK-NEXT: zip1 p0.s, p0.s, p1.s +; CHECK-NEXT: ld1b { z1.d }, p2/z, [x8, z1.d] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) + %svals = sext %vals to + ret %svals +} + +declare @llvm.masked.gather.nxv2i8(, i32, , ) +declare @llvm.masked.gather.nxv2i16(, i32, , ) +declare @llvm.masked.gather.nxv2i32(, i32, , ) + +declare @llvm.masked.gather.nxv4i8(, i32, , ) + +declare @llvm.masked.gather.nxv16i8(, i32, , ) +declare @llvm.masked.gather.nxv32i32(, i32, , )