diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1341,6 +1341,10 @@ setOperationAction(ISD::UDIV, VT, Custom); } + // NEON doesn't support 64-bit vector integer muls, but SVE does. + setOperationAction(ISD::MUL, MVT::v1i64, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. if (Subtarget->useSVEForFixedLengthVectors()) { @@ -1367,8 +1371,6 @@ setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHS, MVT::v1i64, Custom); setOperationAction(ISD::MULHS, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v1i64, Custom); @@ -3950,9 +3952,7 @@ // If SVE is available then i64 vector multiplications can also be made legal. bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; - if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Multiplications are only custom-lowered for 128-bit vectors so that diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -1,4 +1,4 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512 @@ -22,9 +22,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; ADD ; @@ -657,22 +654,32 @@ ret void } -; Vector i64 multiplications are not legal for NEON so use SVE when available. define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { ; CHECK-LABEL: mul_v1i64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d ; CHECK: ret + +; VBITS_EQ_128-LABEL: mul_v1i64: +; VBITS_EQ_128: ptrue p0.d, vl1 +; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128: ret + %res = mul <1 x i64> %op1, %op2 ret <1 x i64> %res } -; Vector i64 multiplications are not legal for NEON so use SVE when available. define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { ; CHECK-LABEL: mul_v2i64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d ; CHECK: ret + +; VBITS_EQ_128-LABEL: mul_v2i64: +; VBITS_EQ_128: ptrue p0.d, vl2 +; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128: ret + %res = mul <2 x i64> %op1, %op2 ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -1,4 +1,4 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -25,14 +25,12 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; SMULH ; ; Don't use SVE for 64-bit vectors. +; FIXME: The codegen for the >=256 bits case can be improved. define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; CHECK-LABEL: smulh_v8i8: ; CHECK: // %bb.0: @@ -166,6 +164,7 @@ } ; Don't use SVE for 64-bit vectors. +; FIXME: The codegen for the >=256 bits case can be improved. define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: smulh_v4i16: ; CHECK: // %bb.0: @@ -294,6 +293,15 @@ ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret + +; VBITS_EQ_128-LABEL: smulh_v2i32: +; VBITS_EQ_128: sshll v0.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: ret + %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 @@ -521,6 +529,7 @@ ; ; Don't use SVE for 64-bit vectors. +; FIXME: The codegen for the >=256 bits case can be improved. define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; CHECK-LABEL: umulh_v8i8: ; CHECK: // %bb.0: @@ -652,6 +661,7 @@ } ; Don't use SVE for 64-bit vectors. +; FIXME: The codegen for the >=256 bits case can be improved. define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: umulh_v4i16: ; CHECK: // %bb.0: @@ -780,6 +790,15 @@ ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret + +; VBITS_EQ_128-LABEL: umulh_v2i32: +; VBITS_EQ_128: ushll v0.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: ret + %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> %mul = mul <2 x i64> %1, %2 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -697,10 +697,7 @@ ; VBITS_EQ_128: ptrue p0.d, vl1 ; VBITS_EQ_128-NEXT: movprfx z2, z0 ; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; VBITS_EQ_128-NEXT: fmov x8, d2 -; VBITS_EQ_128-NEXT: fmov x9, d1 -; VBITS_EQ_128-NEXT: mul x8, x8, x9 -; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d ; VBITS_EQ_128-NEXT: sub d0, d0, d1 ; VBITS_EQ_128-NEXT: ret @@ -723,14 +720,7 @@ ; VBITS_EQ_128: ptrue p0.d, vl2 ; VBITS_EQ_128-NEXT: movprfx z2, z0 ; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; VBITS_EQ_128-NEXT: fmov x9, d2 -; VBITS_EQ_128-NEXT: fmov x10, d1 -; VBITS_EQ_128-NEXT: mov x8, v2.d[1] -; VBITS_EQ_128-NEXT: mov x11, v1.d[1] -; VBITS_EQ_128-NEXT: mul x9, x9, x10 -; VBITS_EQ_128-NEXT: mul x8, x8, x11 -; VBITS_EQ_128-NEXT: fmov d1, x9 -; VBITS_EQ_128-NEXT: mov v1.d[1], x8 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d ; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d ; VBITS_EQ_128-NEXT: ret @@ -1487,10 +1477,7 @@ ; VBITS_EQ_128: ptrue p0.d, vl1 ; VBITS_EQ_128-NEXT: movprfx z2, z0 ; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; VBITS_EQ_128-NEXT: fmov x8, d2 -; VBITS_EQ_128-NEXT: fmov x9, d1 -; VBITS_EQ_128-NEXT: mul x8, x8, x9 -; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d ; VBITS_EQ_128-NEXT: sub d0, d0, d1 ; VBITS_EQ_128-NEXT: ret @@ -1513,14 +1500,7 @@ ; VBITS_EQ_128: ptrue p0.d, vl2 ; VBITS_EQ_128-NEXT: movprfx z2, z0 ; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; VBITS_EQ_128-NEXT: fmov x9, d2 -; VBITS_EQ_128-NEXT: fmov x10, d1 -; VBITS_EQ_128-NEXT: mov x8, v2.d[1] -; VBITS_EQ_128-NEXT: mov x11, v1.d[1] -; VBITS_EQ_128-NEXT: mul x9, x9, x10 -; VBITS_EQ_128-NEXT: mul x8, x8, x11 -; VBITS_EQ_128-NEXT: fmov d1, x9 -; VBITS_EQ_128-NEXT: mov v1.d[1], x8 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d ; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d ; VBITS_EQ_128-NEXT: ret