diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3240,6 +3240,63 @@
   ReplaceNode(N, N3);
 }
 
+// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
+// vector types larger than NEON don't have a matching SubRegIndex.
+static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+  assert(V.getValueType().isScalableVector() &&
+         V.getValueType().getSizeInBits().getKnownMinSize() ==
+             AArch64::SVEBitsPerBlock &&
+         "Expected to extract from a packed scalable vector!");
+  assert(VT.isFixedLengthVector() &&
+         "Expected to extract a fixed length vector!");
+
+  SDLoc DL(V);
+  switch (VT.getSizeInBits()) {
+  case 64: {
+    auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+    return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+  }
+  case 128: {
+    auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+    return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+  }
+  default: {
+    auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+    return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+  }
+  }
+}
+
+// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
+// vector types larger than NEON don't have a matching SubRegIndex.
+static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+  assert(VT.isScalableVector() &&
+         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
+         "Expected to insert into a packed scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected to insert a fixed length vector!");
+
+  SDLoc DL(V);
+  switch (V.getValueType().getSizeInBits()) {
+  case 64: {
+    auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+    auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+                               SDValue(Container, 0), V, SubReg);
+  }
+  case 128: {
+    auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+    auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+                               SDValue(Container, 0), V, SubReg);
+  }
+  default: {
+    auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+    return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+  }
+  }
+}
+
 void AArch64DAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
@@ -3313,6 +3370,52 @@
       return;
     break;
 
+  case ISD::EXTRACT_SUBVECTOR: {
+    // Bail when not a "cast" like extract_subvector.
+    if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
+      break;
+
+    // Bail when normal isel can do the job.
+    EVT InVT = Node->getOperand(0).getValueType();
+    if (VT.isScalableVector() || InVT.isFixedLengthVector())
+      break;
+
+    // NOTE: We can only get here when doing fixed length SVE code generation.
+    // We do manual selection because the types involved are not linked to real
+    // registers (despite being legal) and must be coerced into SVE registers.
+    //
+    // NOTE: If the above changes, be aware that selection will still not work
+    // because the td definition of extract_subvector does not support
+    // extracting a fixed length vector from a scalable vector.
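+    //
+    // For example, with 256-bit SVE registers a "cast" such as
+    //   v8i32 = extract_subvector nxv4i32:t0, 0
+    // reaches this point and extractSubReg selects it to a COPY_TO_REGCLASS,
+    // because the 256-bit v8i32 has no dsub/zsub sub-register index, whereas
+    // a v4i32 result would use EXTRACT_SUBREG with zsub.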
+
+    ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
+    return;
+  }
+
+  case ISD::INSERT_SUBVECTOR: {
+    // Bail when not a "cast" like insert_subvector.
+    if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
+      break;
+    if (!Node->getOperand(0).isUndef())
+      break;
+
+    // Bail when normal isel should do the job.
+    EVT InVT = Node->getOperand(1).getValueType();
+    if (VT.isFixedLengthVector() || InVT.isScalableVector())
+      break;
+
+    // NOTE: We can only get here when doing fixed length SVE code generation.
+    // We do manual selection because the types involved are not linked to real
+    // registers (despite being legal) and must be coerced into SVE registers.
+    //
+    // NOTE: If the above changes, be aware that selection will still not work
+    // because the td definition of insert_subvector does not support
+    // inserting a fixed length vector into a scalable vector.
+
+    ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
+    return;
+  }
+
   case ISD::Constant: {
     // Materialize zero constants as copies from WZR/XZR. This allows
     // the coalescer to propagate these into other instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -846,6 +846,7 @@
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
                               unsigned NewOp) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -116,6 +116,18 @@
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
+/// Returns true if VT's elements occupy the lowest bit positions of its
+/// associated register class without any intervening space.
+///
+/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
+/// same register class, but only nxv8f16 can be treated as a packed vector.
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
+  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal vector type!");
+  return VT.isFixedLengthVector() ||
+         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
+}
+
 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                              const AArch64Subtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -906,6 +918,7 @@
     // D68877 for more details.
     for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
       if (isTypeLegal(VT)) {
+        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
         setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
         setOperationAction(ISD::SELECT, VT, Custom);
         setOperationAction(ISD::SDIV, VT, Custom);
@@ -919,16 +932,18 @@
         setOperationAction(ISD::SRA, VT, Custom);
         if (VT.getScalarType() == MVT::i1)
           setOperationAction(ISD::SETCC, VT, Custom);
-      } else {
-        for (auto VT : { MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32 })
-          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
       }
     }
+
+    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
 
     for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
       if (isTypeLegal(VT)) {
+        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
         setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
         setOperationAction(ISD::SELECT, VT, Custom);
       }
@@ -1035,9 +1050,7 @@
   for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
     setOperationAction(Op, VT, Expand);
 
-  // EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable
-  // and fixed length vector types, although with the current level of support
-  // only the former is exercised.
+  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
   // Lower fixed length vector operations to scalable equivalents.
@@ -3457,6 +3470,8 @@
     return LowerSPLAT_VECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return LowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::SDIV:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
   case ISD::UDIV:
@@ -8665,29 +8680,47 @@
 
 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  assert(!Op.getValueType().isScalableVector() &&
-         "Unexpected scalable type for custom lowering EXTRACT_SUBVECTOR");
+  assert(Op.getValueType().isFixedLengthVector() &&
+         "Only cases that extract a fixed length vector are supported!");
 
-  EVT VT = Op.getOperand(0).getValueType();
-  SDLoc dl(Op);
-  // Just in case...
-  if (!VT.isVector())
-    return SDValue();
+  EVT InVT = Op.getOperand(0).getValueType();
+  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  unsigned Size = Op.getValueSizeInBits();
 
-  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (!Cst)
-    return SDValue();
-  unsigned Val = Cst->getZExtValue();
+  if (InVT.isScalableVector()) {
+    // This will be matched by custom code during ISelDAGToDAG.
+    if (Idx == 0 && isPackedVectorType(InVT, DAG))
+      return Op;
 
-  unsigned Size = Op.getValueSizeInBits();
+    return SDValue();
+  }
 
   // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
-  if (Val == 0)
+  if (Idx == 0 && InVT.getSizeInBits() <= 128)
     return Op;
 
   // If this is extracting the upper 64-bits of a 128-bit vector, we match
   // that directly.
-  if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
+  if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
+    return Op;
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
+  assert(Op.getValueType().isScalableVector() &&
+         "Only expect to lower inserts into scalable vectors!");
+
+  EVT InVT = Op.getOperand(1).getValueType();
+  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+
+  // We don't have any patterns for scalable vectors yet.
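+  // For example, nxv4i32 = insert_subvector undef:nxv4i32, nxv2i32:t0, 0
+  // falls back to the default expansion here, as does any fixed length
+  // subvector whose container is not being lowered via SVE.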
+  if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
+    return SDValue();
+
+  // This will be matched by custom code during ISelDAGToDAG.
+  if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
     return Op;
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
@@ -0,0 +1,88 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+
+; Test that we can code generate patterns of the form:
+;   fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
+;   scalable_vector = ISD::INSERT_SUBVECTOR scalable_vector, fixed_length_vector, 0
+;
+; NOTE: Currently shufflevector does not support scalable vectors so it cannot
+; be used to model the above operations. Instead these tests rely on knowing
+; how fixed length operations are lowered to scalable ones, with multiple
+; blocks ensuring insert/extract sequences are not folded away.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 {
+; CHECK-LABEL: subvector_v8i32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
+; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0]
+; CHECK: st1w { [[DATA]] }, [[PG]], [x1]
+; CHECK: ret
+  %a = load <8 x i32>, <8 x i32>* %in
+  br label %bb1
+
+bb1:
+  store <8 x i32> %a, <8 x i32>* %out
+  ret void
+}
+
+define void @subvector_v16i32(<16 x i32> *%in, <16 x i32>* %out) #0 {
+; CHECK-LABEL: subvector_v16i32:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
+; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0]
+; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1]
+; CHECK: ret
+  %a = load <16 x i32>, <16 x i32>* %in
+  br label %bb1
+
+bb1:
+  store <16 x i32> %a, <16 x i32>* %out
+  ret void
+}
+
+define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 {
+; CHECK-LABEL: subvector_v32i32:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
+; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0]
+; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1]
+; CHECK: ret
+  %a = load <32 x i32>, <32 x i32>* %in
+  br label %bb1
+
+bb1:
+  store <32 x i32> %a, <32 x i32>* %out
+  ret void
+}
+
+define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 {
+; CHECK-LABEL: subvector_v64i32:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
+; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0]
+; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1]
+; CHECK: ret
+  %a = load <64 x i32>, <64 x i32>* %in
+  br label %bb1
+
+bb1:
+  store <64 x i32> %a, <64 x i32>* %out
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
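+
+; For reference, a sketch of how subvector_v8i32 flows through the new code,
+; assuming a 256-bit SVE vector length so that nxv4i32 is the container type
+; for v8i32:
+;
+;   t1: nxv4i32 = masked load from %in          ; fixed length load widened to SVE
+;   t2: v8i32 = extract_subvector t1, 0         ; "cast" kept by LowerEXTRACT_SUBVECTOR
+;   t3: nxv4i32 = insert_subvector undef, t2, 0 ; "cast" kept by LowerINSERT_SUBVECTOR
+;   masked store of t3 to %out                  ; fixed length store widened to SVE
+;
+; During instruction selection both casts are replaced by COPY_TO_REGCLASS
+; nodes, so only the ptrue/ld1w/st1w sequence remains in the final code.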