diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11570,9 +11570,189 @@
   return false;
 }
 
+/// Computes the vector type a splatted scalar had before it was widened.
+///
+/// \p Extend is the scalar node feeding the splat: a real extension
+/// (SIGN_EXTEND / ZERO_EXTEND), a node whose operand 1 names the narrow type
+/// (AssertSext / AssertZext / SIGN_EXTEND_INREG), or an AND with an all-ones
+/// mask of 8, 16 or 32 bits.
+///
+/// \returns a vector with the element count of \p PostExtendType and the
+/// narrow element type, or MVT::Other when the narrow type cannot be
+/// determined.
+static EVT calculatePreExtendVectorType(SDNode *Extend, EVT PostExtendType,
+                                        SelectionDAG &DAG) {
+  EVT PreExtendType;
+
+  switch (Extend->getOpcode()) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    PreExtendType = Extend->getOperand(0).getValueType();
+    break;
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+  case ISD::SIGN_EXTEND_INREG: {
+    // Operand 1 is expected to be the VTSDNode naming the narrow type.
+    const auto *TypeNode = dyn_cast<VTSDNode>(Extend->getOperand(1));
+    if (!TypeNode)
+      return MVT::Other;
+    PreExtendType = TypeNode->getVT();
+    break;
+  }
+  case ISD::AND: {
+    const auto *Constant = dyn_cast<ConstantSDNode>(Extend->getOperand(1));
+    if (!Constant) {
+      LLVM_DEBUG(dbgs() << "Type operand is not a constant.\n");
+      return MVT::Other;
+    }
+
+    // Compare the full-width constant: narrowing it to 32 bits first would let
+    // a 64-bit mask such as 0x1000000FF pass for a zero-extend from i8.
+    // getLimitedValue() saturates rather than asserting on constants wider
+    // than 64 bits; the saturated value matches none of the masks below.
+    const uint64_t Mask = Constant->getAPIntValue().getLimitedValue();
+
+    if (Mask == 0xFFu)
+      PreExtendType = MVT::i8;
+    else if (Mask == 0xFFFFu)
+      PreExtendType = MVT::i16;
+    else if (Mask == 0xFFFFFFFFu)
+      PreExtendType = MVT::i32;
+    else {
+      LLVM_DEBUG(dbgs() << "AND mask constant " << Mask
+                        << " is not a type limit"
+                        << "\n");
+      return MVT::Other;
+    }
+    break;
+  }
+  default:
+    return MVT::Other;
+  }
+
+  return EVT::getVectorVT(*DAG.getContext(), PreExtendType,
+                          PostExtendType.getVectorElementCount());
+}
+
+/// Returns true if a splat built in \p PreExtendVT and then extended to
+/// \p TargetType is one of the type pairs this combine handles.
+static bool isSupportedPreExtendVectorType(EVT PreExtendVT, EVT TargetType) {
+  if (PreExtendVT == MVT::Other)
+    return false;
+
+  // Each target type returns its own verdict, so no dangling else can pair up
+  // with the wrong if; any other target type is rejected.
+  if (TargetType == MVT::v8i16)
+    return PreExtendVT == MVT::v8i8 || PreExtendVT == MVT::v16i8;
+  if (TargetType == MVT::v4i32)
+    return PreExtendVT == MVT::v4i16 || PreExtendVT == MVT::v8i16;
+  if (TargetType == MVT::v2i64)
+    return PreExtendVT == MVT::v2i32 || PreExtendVT == MVT::v4i32;
+  return false;
+}
+
+/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+///
+/// \p VectorShuffle must be a VECTOR_SHUFFLE splatting lane 0 of an
+/// INSERT_VECTOR_ELT that wrote the extended scalar into lane 0.
+///
+/// \returns the vector extend of the narrow splat, or an empty SDValue when
+/// the pattern or its types do not match.
+static SDValue performCommonVectorExtendCombine(SDNode *VectorShuffle,
+                                                SelectionDAG &DAG) {
+  // The rewrite always emits a splat of lane 0, so only accept one as input.
+  const auto *ShuffleNode = dyn_cast<ShuffleVectorSDNode>(VectorShuffle);
+  if (!ShuffleNode || !ShuffleNode->isSplat() ||
+      ShuffleNode->getSplatIndex() != 0)
+    return SDValue();
+
+  // Check the opcode before reading operands 1 and 2 of the splat source.
+  SDValue InsertVectorElt = VectorShuffle->getOperand(0);
+  if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  // The scalar must have been inserted into the lane that is splatted.
+  const auto *InsertLane =
+      dyn_cast<ConstantSDNode>(InsertVectorElt.getOperand(2));
+  if (!InsertLane || InsertLane->getZExtValue() != 0)
+    return SDValue();
+
+  SDValue Extend = InsertVectorElt.getOperand(1);
+  const unsigned ExtendOpcode = Extend.getOpcode();
+
+  const bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
+                      ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
+                      ExtendOpcode == ISD::AssertSext;
+  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
+      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
+    return SDValue();
+
+  const EVT TargetType = VectorShuffle->getValueType(0);
+  const EVT PreExtendVT =
+      calculatePreExtendVectorType(Extend.getNode(), TargetType, DAG);
+  if (!isSupportedPreExtendVectorType(PreExtendVT, TargetType))
+    return SDValue();
+
+  SDLoc DL(VectorShuffle);
+
+  // Every accepted PreExtendVT has i8, i16 or i32 elements. Pass the scalar
+  // as i32; INSERT_VECTOR_ELT truncates a wider value to the element type.
+  SDValue Scalar = DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, MVT::i32);
+  SDValue InsertVectorNode = DAG.getNode(
+      ISD::INSERT_VECTOR_ELT, DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
+      Scalar, DAG.getConstant(0, DL, MVT::i64));
+
+  // An all-zero mask splats lane 0 of the first shuffle operand.
+  std::vector<int> ShuffleMask(TargetType.getVectorNumElements(), 0);
+
+  SDValue VectorShuffleNode =
+      DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
+                           DAG.getUNDEF(PreExtendVT), ShuffleMask);
+
+  // SIGN_EXTEND / ZERO_EXTEND take the value being widened as sole operand.
+  return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+                     TargetType, VectorShuffleNode);
+}
+
+/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+///
+/// \returns the rebuilt multiply, or an empty SDValue if no operand matched.
+static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
+  // If the value type isn't a vector, none of the operands are going to be dups
+  if (!Mul->getValueType(0).isVector())
+    return SDValue();
+
+  std::vector<SDValue> Operands;
+  Operands.reserve(Mul->getNumOperands());
+  bool Changed = false;
+
+  for (SDValue Operand : Mul->op_values()) {
+    if (SDValue ExtNode =
+            performCommonVectorExtendCombine(Operand.getNode(), DAG)) {
+      Changed = true;
+      Operands.push_back(ExtNode);
+    } else {
+      Operands.push_back(Operand);
+    }
+  }
+
+  // If no DAG changes have been made yet, don't make any useless changes
+  if (!Changed)
+    return SDValue();
+
+  SDLoc DL(Mul);
+  return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), Operands);
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
+
+  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
+    return Ext;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
@@ -0,0 +1,327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define <vscale x 2 x i16> @dupsext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext
i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i16> %broadcast.splat, %b
+  ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupsext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i16> %broadcast.splat, %b
+  ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupsext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupsext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = mul nsw <vscale x 8 x i16> %broadcast.splat, %b
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtb x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i8 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupsext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupsext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupsext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nsw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxth x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i16 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupsext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupsext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = sext i32 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nsw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i16> @dupzext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 2 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 2 x i16> %broadcast.splatinsert, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i16> %broadcast.splat, %b
+  ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 4 x i16> @dupzext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 4 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 4 x i16> %broadcast.splatinsert, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i16> %broadcast.splat, %b
+  ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 8 x i16> @dupzext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: dupzext_v8i8_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i16
+  %broadcast.splatinsert = insertelement <vscale x 8 x i16> undef, i16 %in, i16 0
+  %broadcast.splat = shufflevector <vscale x 8 x i16> %broadcast.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = mul nuw <vscale x 8 x i16> %broadcast.splat, %b
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i8_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i8 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i32> @dupzext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 2 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 2 x i32> %broadcast.splatinsert, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i32> %broadcast.splat, %b
+  ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 4 x i32> @dupzext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: dupzext_v4i16_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i32
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32> undef, i32 %in, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = mul nuw <vscale x 4 x i32> %broadcast.splat, %b
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i16 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @dupzext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: dupzext_v2i32_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %in = zext i32 %src to i64
+  %broadcast.splatinsert = insertelement <vscale x 2 x i64> undef, i64 %in, i64 0
+  %broadcast.splat = shufflevector <vscale x 2 x i64> %broadcast.splatinsert, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = mul nuw <vscale x 2 x i64> %broadcast.splat, %b
+  ret <vscale x 2 x i64> %out
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -0,0
+1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s + +; Supported combines + +define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) { +; CHECK-LABEL: dupsext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <8 x i8> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %out = mul nsw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} + +define <8 x i16> @dupzext_v8i8_v8i16(i8 %src, <8 x i8> %b) { +; CHECK-LABEL: dupzext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %ext.b = zext <8 x i8> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %out = mul nuw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} + +define <4 x i32> @dupsext_v4i16_v4i32(i16 %src, <4 x i16> %b) { +; CHECK-LABEL: dupsext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4h, w0 +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i32 + %ext.b = sext <4 x i16> %b to <4 x i32> + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nsw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <4 x i32> @dupzext_v4i16_v4i32(i16 %src, <4 x i16> %b) { +; CHECK-LABEL: dupzext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: dup v1.4h, w0 +; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i32 + %ext.b = zext <4 x i16> %b to <4 x i32> + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nuw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <2 x i64> @dupsext_v2i32_v2i64(i32 %src, <2 x i32> %b) { +; CHECK-LABEL: dupsext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i32 %src to i64 + %ext.b = sext <2 x i32> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +define <2 x i64> @dupzext_v2i32_v2i64(i32 %src, <2 x i32> %b) { +; CHECK-LABEL: dupzext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = zext i32 %src to i64 + %ext.b = zext <2 x i32> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +; Unsupported combines + +define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupsext_v2i8_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <2 x i8> %b to <2 x i16> + %broadcast.splatinsert 
= insertelement <2 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <2 x i16> %broadcast.splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i16> %broadcast.splat, %ext.b + ret <2 x i16> %out +} + +define <4 x i16> @dupsext_v4i8_v4i16(i8 %src, <4 x i8> %b) { +; CHECK-LABEL: dupsext_v4i8_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: mul v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <4 x i8> %b to <4 x i16> + %broadcast.splatinsert = insertelement <4 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <4 x i16> %broadcast.splatinsert, <4 x i16> undef, <4 x i32> zeroinitializer + %out = mul nsw <4 x i16> %broadcast.splat, %ext.b + ret <4 x i16> %out +} + +define <2 x i32> @dupsext_v2i8_v2i32(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupsext_v2i8_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i32 + %ext.b = sext <2 x i8> %b to <2 x i32> + %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i32> %broadcast.splat, %ext.b + ret <2 x i32> %out +} + +define <4 x i32> @dupsext_v4i8_v4i32(i8 %src, <4 x i8> %b) { +; CHECK-LABEL: dupsext_v4i8_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i32 + %ext.b = sext <4 x i8> %b to <4 x i32> + 
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nsw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <2 x i64> @dupsext_v2i8_v2i64(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupsext_v2i8_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v0.2d, v0.2d, #56 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtb x8, w0 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: mul x10, x8, x10 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i64 + %ext.b = sext <2 x i8> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +define <2 x i32> @dupsext_v2i16_v2i32(i16 %src, <2 x i16> %b) { +; CHECK-LABEL: dupsext_v2i16_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i32 + %ext.b = sext <2 x i16> %b to <2 x i32> + %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i32> %broadcast.splat, %ext.b + ret <2 x i32> %out +} + +define <2 x i64> @dupsext_v2i16_v2i64(i16 %src, <2 x i16> %b) { +; CHECK-LABEL: dupsext_v2i16_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v0.2d, v0.2d, #48 +; 
CHECK-NEXT: sshr v0.2d, v0.2d, #48 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: mul x10, x8, x10 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i64 + %ext.b = sext <2 x i16> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +define <2 x i16> @dupzext_v2i8_v2i16(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupzext_v2i8_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %ext.b = zext <2 x i8> %b to <2 x i16> + %broadcast.splatinsert = insertelement <2 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <2 x i16> %broadcast.splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i16> %broadcast.splat, %ext.b + ret <2 x i16> %out +} + +define <4 x i16> @dupzext_v4i8_v4i16(i8 %src, <4 x i8> %b) { +; CHECK-LABEL: dupzext_v4i8_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: mul v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %ext.b = zext <4 x i8> %b to <4 x i16> + %broadcast.splatinsert = insertelement <4 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <4 x i16> %broadcast.splatinsert, <4 x i16> undef, <4 x i32> zeroinitializer + %out = mul nuw <4 x i16> %broadcast.splat, %ext.b + ret <4 x i16> %out +} + +define <2 x i32> @dupzext_v2i8_v2i32(i8 %src, <2 
x i8> %b) { +; CHECK-LABEL: dupzext_v2i8_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i32 + %ext.b = zext <2 x i8> %b to <2 x i32> + %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i32> %broadcast.splat, %ext.b + ret <2 x i32> %out +} + +define <4 x i32> @dupzext_v4i8_v4i32(i8 %src, <4 x i8> %b) { +; CHECK-LABEL: dupzext_v4i8_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i32 + %ext.b = zext <4 x i8> %b to <4 x i32> + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nuw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <2 x i64> @dupzext_v2i8_v2i64(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupzext_v2i8_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: mul x10, x8, x10 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i64 + %ext.b = zext <2 x i8> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector 
<2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +define <2 x i32> @dupzext_v2i16_v2i32(i16 %src, <2 x i16> %b) { +; CHECK-LABEL: dupzext_v2i16_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i32 + %ext.b = zext <2 x i16> %b to <2 x i32> + %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i32> %broadcast.splat, %ext.b + ret <2 x i32> %out +} + +define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) { +; CHECK-LABEL: dupzext_v2i16_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: mul x10, x8, x10 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i64 + %ext.b = zext <2 x i16> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +}