diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -963,6 +963,8 @@
     // Demand any elements from the subvector and the remainder from the src its
     // inserted into.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     SDValue Sub = Op.getOperand(1);
     uint64_t Idx = Op.getConstantOperandVal(2);
     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
@@ -1009,6 +1011,8 @@
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
@@ -2442,6 +2446,8 @@
     // Demand any elements from the subvector and the remainder from the src its
     // inserted into.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     SDValue Sub = Op.getOperand(1);
     uint64_t Idx = Op.getConstantOperandVal(2);
     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
@@ -2487,6 +2493,8 @@
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3247,6 +3247,49 @@
   ReplaceNode(N, N3);
 }
 
+static SDNode *extractFromSVEReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+  SDLoc DL(V);
+  switch (VT.getSizeInBits()) {
+  case 64: {
+    auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+    return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+  }
+  case 128: {
+    auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+    return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+  }
+  default: {
+    auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+    return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+  }
+  }
+}
+
+static SDNode *insertIntoSVEReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected fixed length vector type!");
+  SDLoc DL(V);
+  switch (V.getValueType().getSizeInBits()) {
+  case 64: {
+    auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+    auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+                               SDValue(Container, 0), V, SubReg);
+  }
+  case 128: {
+    auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+    auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+                               SDValue(Container, 0), V, SubReg);
+  }
+  default: {
+    auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+    return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+  }
+  }
+}
+
 void AArch64DAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
@@ -3320,6 +3363,44 @@
       return;
     break;
 
+  case ISD::EXTRACT_SUBVECTOR: {
+    // Bail when not a "cast" like extract_subvector.
+    if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
+      break;
+
+    // Bail when normal isel can do the job.
+    EVT InVT = Node->getOperand(0).getValueType();
+    if (VT.isScalableVector() || !InVT.isScalableVector())
+      break;
+
+    // NOTE: We can only get here when doing fixed length SVE code generation.
+    // We do manual selection because the types involved are not linked to real
+    // registers (despite being legal) and must be coerced into SVE registers.
+
+    ReplaceNode(Node, extractFromSVEReg(CurDAG, VT, Node->getOperand(0)));
+    return;
+  }
+
+  case ISD::INSERT_SUBVECTOR: {
+    // Bail when not a "cast" like insert_subvector.
+    if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
+      break;
+    if (!Node->getOperand(0).isUndef())
+      break;
+
+    // Bail when normal isel should do the job.
+    EVT InVT = Node->getOperand(1).getValueType();
+    if (!VT.isScalableVector() || InVT.isScalableVector())
+      break;
+
+    // NOTE: We can only get here when doing fixed length SVE code generation.
+    // We do manual selection because the types involved are not linked to real
+    // registers (despite being legal) and must be coerced into SVE registers.
+
+    ReplaceNode(Node, insertIntoSVEReg(CurDAG, VT, Node->getOperand(1)));
+    return;
+  }
+
   case ISD::Constant: {
     // Materialize zero constants as copies from WZR/XZR. This allows
     // the coalescer to propagate these into other instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -420,6 +420,7 @@
   /// Provide custom lowering hooks for some operations.
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue LowerOperationNoFixedLengthSVE(SDValue Op, SelectionDAG &DAG) const;
 
   const char *getTargetNodeName(unsigned Opcode) const override;
@@ -710,6 +711,7 @@
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+  void addTypeForFixedLengthSVE(MVT VT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -926,6 +926,15 @@
         setOperationAction(ISD::SELECT, VT, Custom);
       }
     }
+
+    // TODO: Do this as part of the addRegisterClass update above. Currently
+    // we're forced to do this work after computeRegisterProperties.
+    if (useSVEForFixedLengthVectors()) {
+      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+        addTypeForFixedLengthSVE(VT);
+      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+        addTypeForFixedLengthSVE(VT);
+    }
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -1010,6 +1019,27 @@
   }
 }
 
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+  // By default everything requires expanding.
+  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+    setOperationAction(Op, VT, Expand);
+
+  // Use common code for all type legalisation.
+  if (!isTypeLegal(VT))
+    return;
+
+  // These are used to legalise operations and represent the only operations
+  // that are legal to operate on fixed length vectors.
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+
+  // Lower fixed length vector operations to scalable equivalents.
+  setOperationAction(ISD::LOAD, VT, Custom);
+  setOperationAction(ISD::STORE, VT, Custom);
+}
+
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR64RegClass);
   addTypeForNEON(VT, MVT::v2i32);
@@ -3330,8 +3360,9 @@
   return SDValue();
 }
 
-SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
-                                              SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerOperationNoFixedLengthSVE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
   LLVM_DEBUG(Op.dump());
@@ -3484,6 +3515,226 @@
   }
 }
 
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedVector(SelectionDAG &DAG, EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE container");
+  case MVT::i8:
+    return EVT(MVT::nxv16i8);
+  case MVT::i16:
+    return EVT(MVT::nxv8i16);
+  case MVT::i32:
+    return EVT(MVT::nxv4i32);
+  case MVT::i64:
+    return EVT(MVT::nxv2i64);
+  case MVT::f16:
+    return EVT(MVT::nxv8f16);
+  case MVT::f32:
+    return EVT(MVT::nxv4f32);
+  case MVT::f64:
+    return EVT(MVT::nxv2f64);
+  }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue createPredicateForFixedVector(SelectionDAG &DAG, SDLoc &DL,
+                                             EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+
+  int PgPattern;
+  switch (VT.getVectorNumElements()) {
+  default:
+    llvm_unreachable("unexpected element count for SVE predicate");
+  case 256:
+    PgPattern = AArch64SVEPredPattern::vl256;
+    break;
+  case 128:
+    PgPattern = AArch64SVEPredPattern::vl128;
+    break;
+  case 64:
+    PgPattern = AArch64SVEPredPattern::vl64;
+    break;
+  case 32:
+    PgPattern = AArch64SVEPredPattern::vl32;
+    break;
+  case 16:
+    PgPattern = AArch64SVEPredPattern::vl16;
+    break;
+  case 8:
+    PgPattern = AArch64SVEPredPattern::vl8;
+    break;
+  case 4:
+    PgPattern = AArch64SVEPredPattern::vl4;
+    break;
+  case 2:
+    PgPattern = AArch64SVEPredPattern::vl2;
+    break;
+  case 1:
+    PgPattern = AArch64SVEPredPattern::vl1;
+    break;
+  }
+
+  // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+  // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+  // variants of instructions when available.
+
+  MVT MaskVT;
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE predicate");
+  case MVT::i8:
+    MaskVT = MVT::nxv16i1;
+    break;
+  case MVT::i16:
+  case MVT::f16:
+    MaskVT = MVT::nxv8i1;
+    break;
+  case MVT::i32:
+  case MVT::f32:
+    MaskVT = MVT::nxv4i1;
+    break;
+  case MVT::i64:
+  case MVT::f64:
+    MaskVT = MVT::nxv2i1;
+    break;
+  }
+
+  return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+                     DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isScalableVector() &&
+         "Expected to convert into a scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected a fixed length vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isFixedLengthVector() &&
+         "Expected to convert into a fixed length vector!");
+  assert(V.getValueType().isScalableVector() &&
+         "Expected a scalable vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+static SDValue lowerFixedVectorLoadToSVE(SelectionDAG &DAG, SDValue Op) {
+  auto Load = cast<LoadSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT ContainerVT = getContainerForFixedVector(DAG, VT);
+
+  auto NewLoad = DAG.getMaskedLoad(
+      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+      createPredicateForFixedVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+      Load->getExtensionType());
+
+  auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+  SDValue MergedValues[2] = {Result, Load->getChain()};
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
+static SDValue lowerFixedVectorStoreToSVE(SelectionDAG &DAG, SDValue Op) {
+  auto Store = cast<StoreSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Store->getValue().getValueType();
+  EVT ContainerVT = getContainerForFixedVector(DAG, VT);
+
+  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+  return DAG.getMaskedStore(
+      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+      createPredicateForFixedVector(DAG, DL, VT), Store->getMemoryVT(),
+      Store->getMemOperand(), Store->getAddressingMode(),
+      Store->isTruncatingStore());
+}
+
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  if (!useSVEForFixedLengthVectors())
+    return LowerOperationNoFixedLengthSVE(Op, DAG);
+
+  // If we get here then we might have a vector operation that is type legal
+  // but too big to be implemented using NEON. Lower the operation to SVE.
+
+  LLVM_DEBUG(dbgs() << "Fixed length SVE lowering: ");
+  LLVM_DEBUG(Op.dump());
+
+  switch (Op.getOpcode()) {
+  default:
+    return LowerOperationNoFixedLengthSVE(Op, DAG);
+
+  // EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable
+  // and fixed length vector types. These represent legal operations with
+  // everything else requiring custom handling.
+  case ISD::EXTRACT_SUBVECTOR: {
+    EVT VT = Op.getValueType();
+    EVT InVT = Op.getOperand(0).getValueType();
+
+    if (VT.isFixedLengthVector() && InVT.isFixedLengthVector()) {
+      if (InVT.getSizeInBits() <= 128)
+        return Op; // Use NEON.
+
+      return SDValue(); // Expand fixed length extracts...
+    }
+
+    return Op; // ...with everything else being legal.
+  }
+  case ISD::INSERT_SUBVECTOR: {
+    EVT VT = Op.getValueType();
+    EVT InVT = Op.getOperand(1).getValueType();
+
+    if (VT.isFixedLengthVector() && InVT.isFixedLengthVector()) {
+      if (VT.getSizeInBits() <= 128)
+        return Op; // Use NEON.
+
+      return SDValue(); // Expand fixed length inserts...
+    }
+
+    return Op; // ...with everything else being legal.
+  }
+
+  // Convert all vector loads to masked_loads.
+  case ISD::LOAD: {
+    EVT VT = Op.getValueType();
+    if (!VT.isFixedLengthVector())
+      return LowerOperationNoFixedLengthSVE(Op, DAG);
+
+    if (VT.getSizeInBits() <= 128)
+      return Op; // Use NEON.
+
+    return lowerFixedVectorLoadToSVE(DAG, Op);
+  }
+
+  // Convert all vector stores to masked_stores.
+  case ISD::STORE: {
+    EVT InVT = cast<StoreSDNode>(Op)->getValue().getValueType();
+    if (!InVT.isFixedLengthVector())
+      return LowerOperationNoFixedLengthSVE(Op, DAG);
+
+    if (InVT.getSizeInBits() <= 128)
+      return Op; // Use NEON.
+
+    return lowerFixedVectorStoreToSVE(DAG, Op);
+  }
+  }
+}
+
 bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
   // Prefer NEON unless larger SVE registers are available.
   return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user-specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: load_v2f32:
+; CHECK: ldr d0, [x0]
+; CHECK: ret
+  %load = load <2 x float>, <2 x float>* %a
+  ret <2 x float> %load
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: load_v4f32:
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+  %load = load <4 x float>, <4 x float>* %a
+  ret <4 x float> %load
+}
+
+define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: load_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; CHECK: ret
+  %load = load <8 x float>, <8 x float>* %a
+  ret <8 x float> %load
+}
+
+define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: load_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; CHECK: ret
+  %load = load <16 x float>, <16 x float>* %a
+  ret <16 x float> %load
+}
+
+define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: load_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; CHECK: ret
+  %load = load <32 x float>, <32 x float>* %a
+  ret <32 x float> %load
+}
+
+define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: load_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_512-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_512-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; VBITS_LE_256-DAG: add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A4]]]
+; VBITS_LE_256-DAG: add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A5]]]
+; VBITS_LE_256-DAG: add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A6]]]
+; VBITS_LE_256-DAG: add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A7]]]
+; CHECK: ret
+  %load = load <64 x float>, <64 x float>* %a
+  ret <64 x float> %load
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user-specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @store_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: store_v2f32:
+; CHECK: str xzr, [x0]
+; CHECK: ret
+  store <2 x float> zeroinitializer, <2 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @store_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: store_v4f32:
+; CHECK: stp xzr, xzr, [x0]
+; CHECK: ret
+  store <4 x float> zeroinitializer, <4 x float>* %a
+  ret void
+}
+
+define void @store_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: store_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; CHECK: ret
+  store <8 x float> zeroinitializer, <8 x float>* %a
+  ret void
+}
+
+define void @store_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: store_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; CHECK: ret
+  store <16 x float> zeroinitializer, <16 x float>* %a
+  ret void
+}
+
+define void @store_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: store_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; CHECK: ret
+  store <32 x float> zeroinitializer, <32 x float>* %a
+  ret void
+}
+
+define void @store_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: store_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_512-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_512-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; VBITS_LE_256-DAG: add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A4]]]
+; VBITS_LE_256-DAG: add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A5]]]
+; VBITS_LE_256-DAG: add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A6]]]
+; VBITS_LE_256-DAG: add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A7]]]
+; CHECK: ret
+  store <64 x float> zeroinitializer, <64 x float>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
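Illustrative note (not part of the patch): the vl[[#...]] and #[[#...]] FileCheck expressions in the tests above resolve against the -D#VBYTES value passed on each RUN line. Assuming the -aarch64-sve-vector-bits-min=512 run (VBYTES=64), the checks expand roughly as follows:

  ; @load_v8f32:  vl[[#min(div(VBYTES,4),8)]]   ->  vl8   (min(64/4, 8)  = 8)
  ; @load_v32f32: vl[[#min(div(VBYTES,4),32)]]  ->  vl16  (min(64/4, 32) = 16)
  ;               #[[#VBYTES]]                  ->  #64   (second ld1w/st1w at byte offset 64)

so a <32 x float> access is split across two 512-bit SVE registers, which is what the extra VBITS_LE_512 lines check for.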