diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -985,6 +985,8 @@
     // Demand any elements from the subvector and the remainder from the src its
     // inserted into.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     SDValue Sub = Op.getOperand(1);
     uint64_t Idx = Op.getConstantOperandVal(2);
     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
@@ -1031,6 +1033,8 @@
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
@@ -2469,6 +2473,8 @@
     // Demand any elements from the subvector and the remainder from the src its
     // inserted into.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     SDValue Sub = Op.getOperand(1);
     uint64_t Idx = Op.getConstantOperandVal(2);
     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
@@ -2514,6 +2520,8 @@
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -713,6 +713,7 @@
   bool isExtFreeImpl(const Instruction *Ext) const override;

   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+  void addTypeForFixedLengthSVE(MVT VT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);

@@ -847,6 +848,9 @@
   SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
                              EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;

+  SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
@@ -912,7 +916,7 @@
                           const TargetTransformInfo *TTI) const override;

   bool useSVEForFixedLengthVectors() const;
-  bool useSVEForFixedLengthVectorVT(MVT VT) const;
+  bool useSVEForFixedLengthVectorVT(EVT VT) const;
 };

 namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -926,6 +926,17 @@
         setOperationAction(ISD::SELECT, VT, Custom);
       }
     }
+
+    // NOTE: Currently this has to happen after computeRegisterProperties rather
+    // than the preferred option of combining it with the addRegisterClass call.
+    if (useSVEForFixedLengthVectors()) {
+      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+        if (useSVEForFixedLengthVectorVT(VT))
+          addTypeForFixedLengthSVE(VT);
+      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+        if (useSVEForFixedLengthVectorVT(VT))
+          addTypeForFixedLengthSVE(VT);
+    }
   }

   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -1010,6 +1021,28 @@
   }
 }
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+  // By default everything must be expanded.
+  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+    setOperationAction(Op, VT, Expand);
+
+  // EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable
+  // and fixed length vector types, although with the current level of support
+  // only the former is exercised.
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+  // Lower fixed length vector operations to scalable equivalents.
+  setOperationAction(ISD::LOAD, VT, Custom);
+  setOperationAction(ISD::STORE, VT, Custom);
+
+  // NOTE: This is a temporary measure to maintain functionality required by
+  // Analysis/CostModel/AArch64/sve-fixed-length.ll
+  setOperationAction(ISD::ADD, VT, Legal);
+  setOperationAction(ISD::FADD, VT, Legal);
+}
+
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR64RegClass);
   addTypeForNEON(VT, MVT::v2i32);
 }
@@ -3276,6 +3309,9 @@
   EVT MemVT = StoreNode->getMemoryVT();

   if (VT.isVector()) {
+    if (useSVEForFixedLengthVectorVT(VT))
+      return LowerFixedLengthVectorStoreToSVE(Op, DAG);
+
     unsigned AS = StoreNode->getAddressSpace();
     unsigned Align = StoreNode->getAlignment();
     if (Align < MemVT.getStoreSize() &&
@@ -3481,6 +3517,10 @@
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::LOAD:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
+    llvm_unreachable("Unexpected Load.");
   }
 }

@@ -3489,18 +3529,20 @@
   return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
 }

-bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(MVT VT) const {
-  assert(VT.isFixedLengthVector());
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
   if (!useSVEForFixedLengthVectors())
     return false;

+  if (!VT.isFixedLengthVector())
+    return false;
+
   // Fixed length predicates should be promoted to i8.
   // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
   if (VT.getVectorElementType() == MVT::i1)
     return false;

   // Don't use SVE for vectors we cannot scalarize if required.
-  switch (VT.getVectorElementType().SimpleTy) {
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
   default:
     return false;
   case MVT::i8:
@@ -14653,3 +14695,156 @@

   return false;
 }
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE container");
+  case MVT::i8:
+    return EVT(MVT::nxv16i8);
+  case MVT::i16:
+    return EVT(MVT::nxv8i16);
+  case MVT::i32:
+    return EVT(MVT::nxv4i32);
+  case MVT::i64:
+    return EVT(MVT::nxv2i64);
+  case MVT::f16:
+    return EVT(MVT::nxv8f16);
+  case MVT::f32:
+    return EVT(MVT::nxv4f32);
+  case MVT::f64:
+    return EVT(MVT::nxv2f64);
+  }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+                                                EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+
+  int PgPattern;
+  switch (VT.getVectorNumElements()) {
+  default:
+    llvm_unreachable("unexpected element count for SVE predicate");
+  case 1:
+    PgPattern = AArch64SVEPredPattern::vl1;
+    break;
+  case 2:
+    PgPattern = AArch64SVEPredPattern::vl2;
+    break;
+  case 4:
+    PgPattern = AArch64SVEPredPattern::vl4;
+    break;
+  case 8:
+    PgPattern = AArch64SVEPredPattern::vl8;
+    break;
+  case 16:
+    PgPattern = AArch64SVEPredPattern::vl16;
+    break;
+  case 32:
+    PgPattern = AArch64SVEPredPattern::vl32;
+    break;
+  case 64:
+    PgPattern = AArch64SVEPredPattern::vl64;
+    break;
+  case 128:
+    PgPattern = AArch64SVEPredPattern::vl128;
+    break;
+  case 256:
+    PgPattern = AArch64SVEPredPattern::vl256;
+    break;
+  }
+
+  // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+  // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+  // variants of instructions when available.
+
+  MVT MaskVT;
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE predicate");
+  case MVT::i8:
+    MaskVT = MVT::nxv16i1;
+    break;
+  case MVT::i16:
+  case MVT::f16:
+    MaskVT = MVT::nxv8i1;
+    break;
+  case MVT::i32:
+  case MVT::f32:
+    MaskVT = MVT::nxv4i1;
+    break;
+  case MVT::i64:
+  case MVT::f64:
+    MaskVT = MVT::nxv2i1;
+    break;
+  }
+
+  return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+                     DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isScalableVector() &&
+         "Expected to convert into a scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected a fixed length vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isFixedLengthVector() &&
+         "Expected to convert into a fixed length vector!");
+  assert(V.getValueType().isScalableVector() &&
+         "Expected a scalable vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  auto Load = cast<LoadSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+  auto NewLoad = DAG.getMaskedLoad(
+      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+      Load->getExtensionType());
+
+  auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+  SDValue MergedValues[2] = {Result, Load->getChain()};
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  auto Store = cast<StoreSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Store->getValue().getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+  return DAG.getMaskedStore(
+      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+      getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
+      Store->getMemOperand(), Store->getAddressingMode(),
+      Store->isTruncatingStore());
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: load_v2f32:
+; CHECK: ldr d0, [x0]
+; CHECK: ret
+  %load = load <2 x float>, <2 x float>* %a
+  ret <2 x float> %load
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: load_v4f32:
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+  %load = load <4 x float>, <4 x float>* %a
+  ret <4 x float> %load
+}
+
+define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: load_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; CHECK: ret
+  %load = load <8 x float>, <8 x float>* %a
+  ret <8 x float> %load
+}
+
+define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: load_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; CHECK: ret
+  %load = load <16 x float>, <16 x float>* %a
+  ret <16 x float> %load
+}
+
+define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: load_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; CHECK: ret
+  %load = load <32 x float>, <32 x float>* %a
+  ret <32 x float> %load
+}
+
+define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: load_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_512-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_512-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; VBITS_LE_256-DAG: add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A4]]]
+; VBITS_LE_256-DAG: add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A5]]]
+; VBITS_LE_256-DAG: add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A6]]]
+; VBITS_LE_256-DAG: add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A7]]]
+; CHECK: ret
+  %load = load <64 x float>, <64 x float>* %a
+  ret <64 x float> %load
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @store_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: store_v2f32:
+; CHECK: str xzr, [x0]
+; CHECK: ret
+  store <2 x float> zeroinitializer, <2 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @store_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: store_v4f32:
+; CHECK: stp xzr, xzr, [x0]
+; CHECK: ret
+  store <4 x float> zeroinitializer, <4 x float>* %a
+  ret void
+}
+
+define void @store_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: store_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; CHECK: ret
+  store <8 x float> zeroinitializer, <8 x float>* %a
+  ret void
+}
+
+define void @store_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: store_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; CHECK: ret
+  store <16 x float> zeroinitializer, <16 x float>* %a
+  ret void
+}
+
+define void @store_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: store_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; CHECK: ret
+  store <32 x float> zeroinitializer, <32 x float>* %a
+  ret void
+}
+
+define void @store_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: store_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_512-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_512-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; VBITS_LE_256-DAG: add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A4]]]
+; VBITS_LE_256-DAG: add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A5]]]
+; VBITS_LE_256-DAG: add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A6]]]
+; VBITS_LE_256-DAG: add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A7]]]
+; CHECK: ret
+  store <64 x float> zeroinitializer, <64 x float>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
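The CHECK lines in both tests compute the expected ptrue operand as min(div(VBYTES, 4), element count), where 4 is the byte size of an .s element and VBYTES is the -aarch64-sve-vector-bits-min value rounded down to a power of two and expressed in bytes, as the RUN lines show. The standalone C++ sketch below is illustrative only and not part of the patch; the helper names clampedVBytes and expectedVL are invented here to reproduce that FileCheck arithmetic for a few of the cases exercised above.

// expected_vl.cpp - mirrors the min(div(VBYTES,4),N) arithmetic used by the
// FileCheck patterns in the tests above. Illustrative only.
#include <algorithm>
#include <cstdio>

// Round the requested minimum vector length down to a power of two (at least
// 128 bits) and return it in bytes, matching the "clamped to power-of-2
// values" note in the test comments.
static unsigned clampedVBytes(unsigned MinVectorBits) {
  unsigned Bits = 128;
  while (Bits * 2 <= MinVectorBits)
    Bits *= 2;
  return Bits / 8; // VBYTES
}

// Expected ptrue VL operand: min(div(VBYTES, ElementBytes), NumElements).
static unsigned expectedVL(unsigned MinVectorBits, unsigned ElementBytes,
                           unsigned NumElements) {
  return std::min(clampedVBytes(MinVectorBits) / ElementBytes, NumElements);
}

int main() {
  // <8 x float> at -aarch64-sve-vector-bits-min=512: VBYTES=64 -> vl8.
  std::printf("v8f32  @ 512: ptrue vl%u\n", expectedVL(512, 4, 8));
  // <32 x float> at -aarch64-sve-vector-bits-min=384: VBYTES=32 -> vl8.
  std::printf("v32f32 @ 384: ptrue vl%u\n", expectedVL(384, 4, 32));
  // <64 x float> at -aarch64-sve-vector-bits-min=2048: VBYTES=256 -> vl64.
  std::printf("v64f32 @ 2048: ptrue vl%u\n", expectedVL(2048, 4, 64));
  return 0;
}

For example, at -aarch64-sve-vector-bits-min=384 only a 256-bit vector is guaranteed, so a <32 x float> access covers four VBYTES-sized chunks; this is what the VBITS_LE_256 checks verify by expecting three extra add/ld1w (or add/st1w) pairs at offsets VBYTES, 2*VBYTES and 3*VBYTES.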