Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19058,7 +19058,11 @@
   // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
   // place the incoming vectors at the exact same location.
   SDValue SingleSource = SDValue();
-  unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();
+  // For scalable vectors it is safe to use the minimum number of elements,
+  // since this is consistent with the definition of extract_subvector where
+  // the index will be scaled by 'vscale'.
+  unsigned PartNumElem =
+      N->getOperand(0).getValueType().getVectorMinNumElements();
 
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     SDValue Op = N->getOperand(i);
@@ -19180,7 +19184,10 @@
 
   // The binop must be a vector type, so we can extract some fraction of it.
   EVT WideBVT = BinOp.getValueType();
-  if (!WideBVT.isVector())
+  // The optimisations below currently assume we are dealing with fixed length
+  // vectors. It is possible to add support for scalable vectors, but at the
+  // moment we've done no analysis to prove whether they are profitable or not.
+  if (!WideBVT.isFixedLengthVector())
     return SDValue();
 
   EVT VT = Extract->getValueType(0);
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3611,16 +3611,15 @@
   EVT InVT = N->getOperand(0).getValueType();
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
-  unsigned WidenNumElts = WidenVT.getVectorNumElements();
-  unsigned NumInElts = InVT.getVectorNumElements();
 
   unsigned NumOperands = N->getNumOperands();
   bool InputWidened = false; // Indicates we need to widen the input.
   if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
-    if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+    unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+    unsigned NumInElts = InVT.getVectorMinNumElements();
+    if (WidenNumElts % NumInElts == 0) {
       // Add undef vectors to widen to correct length.
-      unsigned NumConcat = WidenVT.getVectorNumElements() /
-                           InVT.getVectorNumElements();
+      unsigned NumConcat = WidenNumElts / NumInElts;
       SDValue UndefVal = DAG.getUNDEF(InVT);
       SmallVector<SDValue, 16> Ops(NumConcat);
       for (unsigned i=0; i < NumOperands; ++i)
@@ -3644,6 +3643,11 @@
         return GetWidenedVector(N->getOperand(0));
 
       if (NumOperands == 2) {
+        assert(!WidenVT.isScalableVector() &&
+               "Cannot use vector shuffles to widen CONCAT_VECTOR result");
+        unsigned WidenNumElts = WidenVT.getVectorNumElements();
+        unsigned NumInElts = InVT.getVectorNumElements();
+
         // Replace concat of two operands with a shuffle.
         SmallVector<int, 16> MaskOps(WidenNumElts, -1);
         for (unsigned i = 0; i < NumInElts; ++i) {
@@ -3658,6 +3662,11 @@
     }
   }
 
+  assert(!WidenVT.isScalableVector() &&
+         "Cannot use build vectors to widen CONCAT_VECTOR result");
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  unsigned NumInElts = InVT.getVectorNumElements();
+
   // Fall back to use extracts and build vector.
   EVT EltVT = WidenVT.getVectorElementType();
   SmallVector<SDValue, 16> Ops(WidenNumElts);
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll
@@ -325,6 +325,39 @@
   ret <vscale x 2 x i64> %out
 }
 
+; ADD (tuples)
+
+define <vscale x 4 x i64> @add_i64_tuple2(<vscale x 4 x i64>* %out, <vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2) {
+; CHECK-LABEL: add_i64_tuple2
+; CHECK: add z0.d, z0.d, z0.d
+; CHECK: add z1.d, z1.d, z1.d
+  %tuple = tail call <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2)
+  %res = add <vscale x 4 x i64> %tuple, %tuple
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 6 x i64> @add_i64_tuple3(<vscale x 6 x i64>* %out, <vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3) {
+; CHECK-LABEL: add_i64_tuple3
+; CHECK: add z0.d, z0.d, z0.d
+; CHECK: add z1.d, z1.d, z1.d
+; CHECK: add z2.d, z2.d, z2.d
+  %tuple = tail call <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3)
+  %res = add <vscale x 6 x i64> %tuple, %tuple
+  ret <vscale x 6 x i64> %res
+}
+
+define <vscale x 8 x i64> @add_i64_tuple4(<vscale x 8 x i64>* %out, <vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3, <vscale x 2 x i64> %in4) {
+; CHECK-LABEL: add_i64_tuple4
+; CHECK: add z0.d, z0.d, z0.d
+; CHECK: add z1.d, z1.d, z1.d
+; CHECK: add z2.d, z2.d, z2.d
+; CHECK: add z3.d, z3.d, z3.d
+  %tuple = tail call <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64> %in1, <vscale x 2 x i64> %in2, <vscale x 2 x i64> %in3, <vscale x 2 x i64> %in4)
+  %res = add <vscale x 8 x i64> %tuple, %tuple
+  ret <vscale x 8 x i64> %res
+}
+
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>)
@@ -366,3 +399,7 @@
 declare <vscale x 8 x i16> @llvm.aarch64.sve.uqsub.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.uqsub.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.uqsub.x.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 4 x i64> @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
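Note (not part of the patch): the standalone C++ sketch below illustrates why the widening code above switches to getVectorMinNumElements(). For a scalable type such as nxv2i64 only the minimum element count is known at compile time, but element-count arithmetic like NumConcat = WidenNumElts / NumInElts stays valid because both counts are scaled by the same run-time vscale. It uses only upstream LLVM EVT/MVT APIs and the types from the add_i64_tuple2 test; it is illustrative, not a definitive reproduction of the legalizer.

// Minimal sketch: element-count arithmetic on scalable EVTs using the
// minimum element counts, mirroring the NumConcat computation above.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;

  // Widened result type nxv4i64 and input part type nxv2i64, as in the
  // add_i64_tuple2 test above.
  EVT WidenVT = EVT::getVectorVT(Ctx, MVT::i64, 4, /*IsScalable=*/true);
  EVT InVT = EVT::getVectorVT(Ctx, MVT::i64, 2, /*IsScalable=*/true);

  unsigned WidenNumElts = WidenVT.getVectorMinNumElements(); // 4 (x vscale)
  unsigned NumInElts = InVT.getVectorMinNumElements();       // 2 (x vscale)

  // Same computation the widening path performs: two nxv2i64 parts are needed
  // to build one nxv4i64, independent of the run-time value of vscale.
  assert(WidenNumElts % NumInElts == 0);
  unsigned NumConcat = WidenNumElts / NumInElts; // 2
  (void)NumConcat;
  return 0;
}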