diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11657,9 +11657,159 @@ return false; } +/// Calculates what the pre-extend type is, based on the extension +/// operation node provided by \p Extend. +/// +/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the +/// pre-extend type is pulled directly from the operand, while other extend +/// operations need a bit more inspection to get this information. +/// +/// \param Extend The SDNode from the DAG that represents the extend operation +/// \param DAG The SelectionDAG hosting the \p Extend node +/// +/// \returns The type representing the \p Extend source type, or \p MVT::Other +/// if no valid type can be determined +static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { + switch (Extend.getOpcode()) { + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return Extend.getOperand(0).getValueType(); + case ISD::AssertSext: + case ISD::AssertZext: + case ISD::SIGN_EXTEND_INREG: { + VTSDNode *TypeNode = dyn_cast(Extend.getOperand(1)); + if (!TypeNode) + return MVT::Other; + return TypeNode->getVT(); + } + case ISD::AND: { + ConstantSDNode *Constant = + dyn_cast(Extend.getOperand(1).getNode()); + if (!Constant) + return MVT::Other; + + uint32_t Mask = Constant->getZExtValue(); + + if (Mask == UCHAR_MAX) + return MVT::i8; + else if (Mask == USHRT_MAX) + return MVT::i16; + else if (Mask == UINT_MAX) + return MVT::i32; + + return MVT::Other; + } + default: + return MVT::Other; + } + + llvm_unreachable("Code path unhandled in calculatePreExtendType!"); +} + +/// Combines a dup(sext/zext) node pattern into sext/zext(dup) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, + SelectionDAG &DAG) { + + if (VectorShuffle.getOpcode() != 
ISD::VECTOR_SHUFFLE) + return SDValue(); + + ShuffleVectorSDNode *ShuffleNode = + dyn_cast(VectorShuffle.getNode()); + // Defensive check, if this fails something has gone wrong somewhere + if (!ShuffleNode) + return SDValue(); + + if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) + // Non-zero mask + return SDValue(); + + SDValue InsertVectorElt = VectorShuffle.getOperand(0); + + if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + SDValue InsertLane = InsertVectorElt.getOperand(2); + ConstantSDNode *Constant; + // Ensures the insert is inserting into lane 0 + if (!(Constant = dyn_cast(InsertLane.getNode()))) + return SDValue(); + + if (Constant->getZExtValue() != 0) + return SDValue(); + + SDValue Extend = InsertVectorElt.getOperand(1); + unsigned ExtendOpcode = Extend.getOpcode(); + + bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || + ExtendOpcode == ISD::SIGN_EXTEND_INREG || + ExtendOpcode == ISD::AssertSext; + if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && + ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) + return SDValue(); + + EVT TargetType = VectorShuffle.getValueType(); + EVT PreExtendType = calculatePreExtendType(Extend, DAG); + + if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && + TargetType != MVT::v2i64) || + (PreExtendType == MVT::Other)) + return SDValue(); + + EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); + + if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) + return SDValue(); + + if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + return SDValue(); + + SDLoc DL(VectorShuffle); + + SDValue InsertVectorNode = DAG.getNode( + InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), + Extend.getOperand(0), DAG.getConstant(0, DL, MVT::i64)); + + std::vector ShuffleMask(TargetType.getVectorElementCount().getValue()); + + SDValue VectorShuffleNode = + DAG.getVectorShuffle(PreExtendVT, 
DL, InsertVectorNode, + DAG.getUNDEF(PreExtendVT), ShuffleMask); + + SDValue ExtendNode = + DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, TargetType, + VectorShuffleNode, DAG.getValueType(TargetType)); + + return ExtendNode; +} + +/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { + // If the value type isn't a vector, none of the operands are going to be dups + if (!Mul->getValueType(0).isVector()) + return SDValue(); + + SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + + // Neither operands have been changed, don't make any further changes + if (!Op0 && !Op1) + return SDValue(); + + SDLoc DL(Mul); + return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), + Op0 ? Op0 : Mul->getOperand(0), + Op1 ? 
Op1 : Mul->getOperand(1));
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
+
+  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
+    return Ext;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
@@ -0,0 +1,327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define <vscale x 2 x i16> @dupsext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i16> %broadcast.splat, %b
+  ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupsext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i16> %broadcast.splat, %b
+  ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupsext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupsext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = mul nsw <vscale x 8 x i16> %broadcast.splat, %b
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtb x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxth x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i32 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i16> @dupzext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i16> %broadcast.splat, %b
+  ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupzext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i16> %broadcast.splat, %b
+  ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupzext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupzext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = mul nuw <vscale x 8 x i16> %broadcast.splat, %b
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i32 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
+
+; Supported combines
+
+define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: dupsext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.8b, w0
+; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %ext.b = sext <8 x i8> %b to <8 x i16>
+  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+  ret <8 x i16> %out
+}
+
+define <8 x i16> @dupzext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: dupzext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.8b, w0
+; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %ext.b = zext <8 x i8> %b to <8 x i16>
+  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %out = mul nuw <8 x i16> %broadcast.splat, %ext.b
+  ret <8 x i16> %out
+}
+
+define <4 x i32> @dupsext_v4i16_v4i32(i16 %src, <4 x i16> %b) {
+; CHECK-LABEL: dupsext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.4h, w0
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i32
+  %ext.b = sext <4 x i16> %b to <4 x i32>
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %out = mul nsw <4 x i32> %broadcast.splat, %ext.b
+  ret <4 x i32> %out
+}
+
+define <4 x i32> @dupzext_v4i16_v4i32(i16 %src, <4 x i16> %b) {
+; CHECK-LABEL: dupzext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.4h, w0
+; CHECK-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i32
+  %ext.b = zext <4 x i16> %b to <4 x i32>
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %out = mul nuw <4 x i32> %broadcast.splat, %ext.b
+  ret <4 x i32> %out
+}
+
+define <2 x i64> @dupsext_v2i32_v2i64(i32 %src, <2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i32 %src to i64
+  %ext.b = sext <2 x i32> %b to <2 x i64>
+  %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  %out = mul nsw <2 x i64> %broadcast.splat, %ext.b
+  ret <2 x i64> %out
+}
+
+define <2 x i64> @dupzext_v2i32_v2i64(i32 %src, <2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    umull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i32 %src to i64
+  %ext.b = zext <2 x i32> %b to <2 x i64>
+  %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  %out = mul nuw <2 x i64> %broadcast.splat, %ext.b
+  ret <2 x i64> %out
+}
+
+; Unsupported combines
+
+define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    mul v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %ext.b = sext <2 x i8> %b to <2 x i16>
+  %broadcast.splatinsert = insertelement <2 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <2 x i16> %broadcast.splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer
+  %out = mul nsw <2 x i16> %broadcast.splat, %ext.b
+  ret <2 x i16> %out
+}
+
+define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    mul x10, x8, x10
+; CHECK-NEXT:    mul x8, x8, x9
+; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i64
+  %ext.b = zext <2 x i16> %b to <2 x i64>
+  %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  %out = mul nuw <2 x i64> %broadcast.splat, %ext.b
+  ret <2 x i64> %out
+}
+
+; dupsext_v4i8_v4i16
+; dupsext_v2i8_v2i32
+; dupsext_v4i8_v4i32
+; dupsext_v2i8_v2i64
+; dupsext_v2i16_v2i32
+; dupsext_v2i16_v2i64
+; dupzext_v2i8_v2i16
+; dupzext_v4i8_v4i16
+; dupzext_v2i8_v2i32
+; dupzext_v4i8_v4i32
+; dupzext_v2i8_v2i64
+; dupzext_v2i16_v2i32
+; dupzext_v2i16_v2i64
+
+; Unsupported states
+
+define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) {
+; CHECK-LABEL: nonsplat_shuffleinsert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %ext.b = sext <8 x i8> %b to <8 x i16>
+  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 1
+  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+  ret <8 x i16> %out
+}
+
+define <8 x i16> @missing_insert(<8 x i8> %b) {
+; CHECK-LABEL: missing_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %ext.b = sext <8 x i8> %b to <8 x i16>
+  %broadcast.splat = shufflevector <8 x i16> %ext.b, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef>
+  %out = mul nsw <8 x i16> %broadcast.splat, %ext.b
+  ret <8 x i16> %out
+}