Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1338,6 +1338,10 @@
       setOperationAction(ISD::UDIV, VT, Custom);
     }
 
+    // NEON doesn't support 64-bit vector integer muls, but SVE does.
+    setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1364,8 +1368,6 @@
       setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
       setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
      setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
-      setOperationAction(ISD::MUL, MVT::v1i64, Custom);
-      setOperationAction(ISD::MUL, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
@@ -3946,7 +3948,8 @@
   // If SVE is available then i64 vector multiplications can also be made legal.
   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
 
-  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
+  if (VT.isScalableVector() || (OverrideNEON && Subtarget->hasSVE()) ||
+      useSVEForFixedLengthVectorVT(VT, false))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
 
   // Multiplications are only custom-lowered for 128-bit vectors so that
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
@@ -22,9 +22,6 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; ADD
 ;
@@ -657,22 +654,32 @@
   ret void
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
@@ -25,14 +25,11 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SMULH
 ;
 
-; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256-bit case can be improved.
 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: smulh_v8i8:
 ; CHECK: // %bb.0:
@@ -56,6 +53,12 @@
 ; CHECK-NEXT: mov v0.b[7], w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v8i8:
+; VBITS_EQ_128: smull v0.8h, v0.8b, v1.8b
+; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
+; VBITS_EQ_128-NEXT: ret
+
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i8> %op1 to <8 x i16>
@@ -66,7 +69,6 @@
   ret <8 x i8> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK-LABEL: smulh_v16i8:
 ; CHECK: // %bb.0:
@@ -74,6 +76,13 @@
 ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v16i8:
+; VBITS_EQ_128: smull2 v2.8h, v0.16b, v1.16b
+; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v1.8b
+; VBITS_EQ_128-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -165,7 +174,7 @@
   ret void
 }
 
-; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256-bit case can be improved.
 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: smulh_v4i16:
 ; CHECK: // %bb.0:
@@ -180,6 +189,12 @@
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v4i16:
+; VBITS_EQ_128: smull v0.4s, v0.4h, v1.4h
+; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -188,7 +203,6 @@
   ret <4 x i16> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-LABEL: smulh_v8i16:
 ; CHECK: // %bb.0:
@@ -196,6 +210,13 @@
 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v8i16:
+; VBITS_EQ_128: smull2 v2.4s, v0.8h, v1.8h
+; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v1.4h
+; VBITS_EQ_128-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -294,6 +315,15 @@
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i32:
+; VBITS_EQ_128: sshll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -302,7 +332,6 @@
   ret <2 x i32> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-LABEL: smulh_v4i32:
 ; CHECK: // %bb.0:
@@ -310,6 +339,13 @@
 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v4i32:
+; VBITS_EQ_128: smull2 v2.2d, v0.4s, v1.4s
+; VBITS_EQ_128-NEXT: smull v0.2d, v0.2s, v1.2s
+; VBITS_EQ_128-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -398,7 +434,6 @@
   ret void
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: smulh_v1i64:
 ; CHECK: // %bb.0:
@@ -408,6 +443,14 @@
 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v1i64:
+; VBITS_EQ_128: fmov x8, d0
+; VBITS_EQ_128-NEXT: fmov x9, d1
+; VBITS_EQ_128-NEXT: smulh x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d0, x8
+; VBITS_EQ_128-NEXT: ret
+
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = sext <1 x i64> %op1 to <1 x i128>
@@ -418,7 +461,6 @@
   ret <1 x i64> %res
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: smulh_v2i64:
 ; CHECK: // %bb.0:
@@ -428,6 +470,19 @@
 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i64:
+; VBITS_EQ_128: mov x8, v0.d[1]
+; VBITS_EQ_128-NEXT: fmov x10, d0
+; VBITS_EQ_128-NEXT: mov x9, v1.d[1]
+; VBITS_EQ_128-NEXT: fmov x11, d1
+; VBITS_EQ_128-NEXT: smulh x10, x10, x11
+; VBITS_EQ_128-NEXT: smulh x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d0, x10
+; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
+; VBITS_EQ_128-NEXT: ret
+
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -520,7 +575,7 @@
 ; UMULH
 ;
 
-; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256-bit case can be improved.
 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: umulh_v8i8:
 ; CHECK: // %bb.0:
@@ -544,6 +599,12 @@
 ; CHECK-NEXT: mov v0.b[7], w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v8i8:
+; VBITS_EQ_128: umull v0.8h, v0.8b, v1.8b
+; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
@@ -552,7 +613,6 @@
   ret <8 x i8> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK-LABEL: umulh_v16i8:
 ; CHECK: // %bb.0:
@@ -560,6 +620,13 @@
 ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v16i8:
+; VBITS_EQ_128: umull2 v2.8h, v0.16b, v1.16b
+; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v1.8b
+; VBITS_EQ_128-NEXT: uzp2 v0.16b, v0.16b, v2.16b
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -651,7 +718,7 @@
   ret void
 }
 
-; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256-bit case can be improved.
 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: umulh_v4i16:
 ; CHECK: // %bb.0:
@@ -666,6 +733,12 @@
 ; CHECK-NEXT: mov v0.h[3], w8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v4i16:
+; VBITS_EQ_128: umull v0.4s, v0.4h, v1.4h
+; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -674,7 +747,6 @@
   ret <4 x i16> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-LABEL: umulh_v8i16:
 ; CHECK: // %bb.0:
@@ -682,6 +754,13 @@
 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v8i16:
+; VBITS_EQ_128: umull2 v2.4s, v0.8h, v1.8h
+; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v1.4h
+; VBITS_EQ_128-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -770,7 +849,6 @@
   ret void
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-LABEL: umulh_v2i32:
 ; CHECK: // %bb.0:
@@ -780,6 +858,15 @@
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i32:
+; VBITS_EQ_128: ushll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -788,7 +875,6 @@
   ret <2 x i32> %res
 }
 
-; Don't use SVE for 128-bit vectors.
 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-LABEL: umulh_v4i32:
 ; CHECK: // %bb.0:
@@ -796,6 +882,13 @@
 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v4i32:
+; VBITS_EQ_128: umull2 v2.2d, v0.4s, v1.4s
+; VBITS_EQ_128-NEXT: umull v0.2d, v0.2s, v1.2s
+; VBITS_EQ_128-NEXT: uzp2 v0.4s, v0.4s, v2.4s
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -886,7 +979,6 @@
   ret void
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: umulh_v1i64:
 ; CHECK: // %bb.0:
@@ -896,6 +988,14 @@
 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v1i64:
+; VBITS_EQ_128: fmov x8, d0
+; VBITS_EQ_128-NEXT: fmov x9, d1
+; VBITS_EQ_128-NEXT: umulh x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d0, x8
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
@@ -904,7 +1004,6 @@
   ret <1 x i64> %res
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: umulh_v2i64:
 ; CHECK: // %bb.0:
@@ -914,6 +1013,19 @@
 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i64:
+; VBITS_EQ_128: mov x8, v0.d[1]
+; VBITS_EQ_128-NEXT: fmov x10, d0
+; VBITS_EQ_128-NEXT: mov x9, v1.d[1]
+; VBITS_EQ_128-NEXT: fmov x11, d1
+; VBITS_EQ_128-NEXT: umulh x10, x10, x11
+; VBITS_EQ_128-NEXT: umulh x8, x8, x9
+; VBITS_EQ_128-NEXT: fmov d0, x10
+; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0]
+; VBITS_EQ_128-NEXT: ret
+
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -697,10 +697,7 @@
 ; VBITS_EQ_128: ptrue p0.d, vl1
 ; VBITS_EQ_128-NEXT: movprfx z2, z0
 ; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT: sub d0, d0, d1
 ; VBITS_EQ_128-NEXT: ret
 
@@ -723,14 +720,7 @@
 ; VBITS_EQ_128: ptrue p0.d, vl2
 ; VBITS_EQ_128-NEXT: movprfx z2, z0
 ; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
 ; VBITS_EQ_128-NEXT: ret
 
@@ -1487,10 +1477,7 @@
 ; VBITS_EQ_128: ptrue p0.d, vl1
 ; VBITS_EQ_128-NEXT: movprfx z2, z0
 ; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT: sub d0, d0, d1
 ; VBITS_EQ_128-NEXT: ret
 
@@ -1513,15 +1500,8 @@
 ; VBITS_EQ_128: ptrue p0.d, vl2
 ; VBITS_EQ_128-NEXT: movprfx z2, z0
 ; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
-; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
 ; VBITS_EQ_128-NEXT: ret
 
   %res = urem <2 x i64> %op1, %op2