Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4583,8 +4583,42 @@
       return AArch64ISD::SMULL;
     }
   }
+
+  // Select UMULL if we can replace the other operand with an extend.
+  if (IsN0ZExt || IsN1ZExt) {
+    EVT VT = N0->getValueType(0);
+    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
+                                       VT.getScalarSizeInBits() / 2);
+    if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? N1 : N0, 0), Mask)) {
+      EVT HalfVT;
+      switch (VT.getSimpleVT().SimpleTy) {
+      case MVT::v2i64:
+        HalfVT = MVT::v2i32;
+        break;
+      case MVT::v4i32:
+        HalfVT = MVT::v4i16;
+        break;
+      case MVT::v8i16:
+        HalfVT = MVT::v8i8;
+        break;
+      default:
+        return 0;
+      }
+      // Truncate and then extend the result.
+      SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                   SDValue(IsN0ZExt ? N1 : N0, 0));
+      NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT);
+      if (IsN0ZExt)
+        N1 = NewExt.getNode();
+      else
+        N0 = NewExt.getNode();
+      return AArch64ISD::UMULL;
+    }
+  }
+
   if (!IsN1SExt && !IsN1ZExt)
     return 0;
+
   // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
   // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
   if (IsN1SExt && isAddSubSExt(N0, DAG)) {
Index: llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -116,17 +116,13 @@
 define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0xffff
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    dup v2.2d, x8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mul x9, x8, x9
-; CHECK-NEXT:    mul x8, x8, x10
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    umull v0.2d, v2.2s, v0.2s
 ; CHECK-NEXT:    ret
 entry:
   %in = zext i16 %src to i64
@@ -226,12 +222,12 @@
 define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) {
 ; CHECK-LABEL: typei1_v8i1_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.8b, #1
 ; CHECK-NEXT:    and w8, w0, #0x1
+; CHECK-NEXT:    movi v1.8b, #1
+; CHECK-NEXT:    dup v2.8h, w8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v2.8b, v2.8h
+; CHECK-NEXT:    umull v0.8h, v2.8b, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %in = zext i1 %src to i16
Index: llvm/test/CodeGen/AArch64/aarch64-smull.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -932,9 +932,9 @@
 define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: umull_and_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -946,9 +946,9 @@
 define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: umull_and_v8i16_c:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -989,9 +989,9 @@
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.8b, #15
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <8 x i4> %src1 to <8 x i16>
@@ -1004,9 +1004,9 @@
 ; CHECK-LABEL: umull_and_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <4 x i16> %src1 to <4 x i32>
@@ -1019,16 +1019,9 @@
 ; CHECK-LABEL: umull_and_v2i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    mov x11, v0.d[1]
-; CHECK-NEXT:    mul x9, x10, x9
-; CHECK-NEXT:    mul x8, x11, x8
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
 entry:
   %in1 = zext <2 x i32> %src1 to <2 x i64>
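For reference, a minimal IR sketch of the pattern the new combine targets, written in the style of the tests above (the function name is illustrative, not part of the patch). One operand is an explicit zext; the other is only known to have a zero high half, which DAG.MaskedValueIsZero proves from the and mask, letting the combine rebuild it as a truncate-then-extend and select UMULL:

define <2 x i64> @umull_known_bits_sketch(<2 x i32> %a, <2 x i64> %b) {
entry:
  ; %in1 is a real zero-extend; %in2 is not, but the 'and' clears the high
  ; 32 bits of each lane, so it is equivalent to one and can be narrowed
  ; to <2 x i32> for a umull.
  %in1 = zext <2 x i32> %a to <2 x i64>
  %in2 = and <2 x i64> %b, <i64 255, i64 255>
  %out = mul <2 x i64> %in1, %in2
  ret <2 x i64> %out
}

The <2 x i64> cases show the largest improvement: NEON has no 64x64-bit vector multiply, so without UMULL the multiply had to be scalarized through general-purpose registers (the fmov/mul/mov sequences removed in dupzext_v2i16_v2i64 and umull_and_v2i64 above).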