diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -221,6 +221,11 @@
 def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs),
                                [(setone node:$lhs, node:$rhs),
                                 (setne node:$lhs, node:$rhs)]>;
+def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
+                                  (AArch64mul_p node:$pred, node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+
 let Predicates = [HasSVE] in {
   defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
 
@@ -268,8 +273,8 @@
   defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
   defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
-  defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
-  defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;
+  defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>;
+  defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>;
 
   // SVE predicated integer reductions.
   defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2499,7 +2499,8 @@
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op,
+                                 SDPatternOperator outerop, SDPatternOperator mulop> {
   def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
   def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
   def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
@@ -2509,6 +2510,15 @@
   def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
   def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
   def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+  def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)),
+            (!cast<Instruction>(NAME # _B) $pred, $Op1, $Op2, $Op3)>;
+  def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)),
+            (!cast<Instruction>(NAME # _H) $pred, $Op1, $Op2, $Op3)>;
+  def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)),
+            (!cast<Instruction>(NAME # _S) $pred, $Op1, $Op2, $Op3)>;
+  def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)),
+            (!cast<Instruction>(NAME # _D) $pred, $Op1, $Op2, $Op3)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -131,8 +131,7 @@
 ; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    ret
   %div = srem <vscale x 16 x i8> %a, %b
   ret <vscale x 16 x i8> %div
@@ -151,8 +150,7 @@
 ; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    ret
   %div = srem <vscale x 8 x i16> %a, %b
   ret <vscale x 8 x i16> %div
@@ -164,8 +162,7 @@
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    ret
   %div = srem <vscale x 4 x i32> %a, %b
   ret <vscale x 4 x i32> %div
@@ -177,8 +174,7 @@
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    ret
   %div = srem <vscale x 2 x i64> %a, %b
   ret <vscale x 2 x i64> %div
@@ -315,8 +311,7 @@
 ; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    ret
   %div = urem <vscale x 16 x i8> %a, %b
   ret <vscale x 16 x i8> %div
@@ -335,8 +330,7 @@
 ; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    ret
   %div = urem <vscale x 8 x i16> %a, %b
   ret <vscale x 8 x i16> %div
@@ -348,8 +342,7 @@
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    ret
   %div = urem <vscale x 4 x i32> %a, %b
   ret <vscale x 4 x i32> %div
@@ -361,8 +354,7 @@
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    ret
   %div = urem <vscale x 2 x i64> %a, %b
   ret <vscale x 2 x i64> %div
diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll
--- a/llvm/test/CodeGen/AArch64/sve-gep.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gep.ll
@@ -105,10 +105,11 @@
 ; CHECK-LABEL: scalable_of_scalable_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov z1.d, #1 // =0x1
 ; CHECK-NEXT:    mov z0.d, x0
-; CHECK-NEXT:    mul z1.d, z1.d, #1
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    ret
   %idx = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 1, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer
   %d = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, <vscale x 2 x i64> %idx
@@ -119,9 +120,10 @@
 ; CHECK-LABEL: scalable_of_scalable_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    mul z1.d, z1.d, #1
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, #1 // =0x1
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    ret
   %idx = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 1, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer
   %d = getelementptr <vscale x 2 x i64>, <vscale x 2 x <vscale x 2 x i64>*> %base, <vscale x 2 x i64> %idx
@@ -135,8 +137,7 @@
 ; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    sxtw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    ret
   %d = getelementptr <vscale x 2 x i64>, <vscale x 2 x <vscale x 2 x i64>*> %base, <vscale x 2 x i32> %idx
   ret <vscale x 2 x <vscale x 2 x i64>*> %d
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
@@ -5,200 +6,262 @@
 ; WARN-NOT: warning
 
 define <vscale x 2 x i64> @add_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: add_i64
-; CHECK: add z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: add_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = add <vscale x 2 x i64> %a, %b
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: add_i32
-; CHECK: add z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: add_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = add <vscale x 4 x i32> %a, %b
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @add_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: add_i16
-; CHECK: add z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: add_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = add <vscale x 8 x i16> %a, %b
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @add_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: add_i8
-; CHECK: add z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: add_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = add <vscale x 16 x i8> %a, %b
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: sub_i64
-; CHECK: sub z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: sub_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = sub <vscale x 2 x i64> %a, %b
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: sub_i32
-; CHECK: sub z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: sub_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = sub <vscale x 4 x i32> %a, %b
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @sub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sub_i16
-; CHECK: sub z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: sub_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = sub <vscale x 8 x i16> %a, %b
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sub_i8
-; CHECK: sub z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: sub_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = sub <vscale x 16 x i8> %a, %b
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 2 x i64> @sqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: sqadd_i64
-; CHECK: sqadd z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqadd_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqadd z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @sqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: sqadd_i32
-; CHECK: sqadd z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqadd_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqadd z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @sqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sqadd_i16
-; CHECK: sqadd z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqadd_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqadd z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @sqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sqadd_i8
-; CHECK: sqadd z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqadd_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqadd z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 2 x i64> @sqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: sqsub_i64
-; CHECK: sqsub z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqsub_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqsub z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @sqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: sqsub_i32
-; CHECK: sqsub z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqsub_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqsub z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @sqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sqsub_i16
-; CHECK: sqsub z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqsub_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqsub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @sqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sqsub_i8
-; CHECK: sqsub z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: sqsub_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqsub z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 2 x i64> @uqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: uqadd_i64
-; CHECK: uqadd z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqadd_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @uqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: uqadd_i32
-; CHECK: uqadd z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqadd_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @uqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: uqadd_i16
-; CHECK: uqadd z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqadd_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @uqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: uqadd_i8
-; CHECK: uqadd z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqadd_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqadd z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 2 x i64> @uqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: uqsub_i64
-; CHECK: uqsub z0.d, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqsub_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqsub z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 4 x i32> @uqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: uqsub_i32
-; CHECK: uqsub z0.s, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqsub_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqsub z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %res
 }
 
 define <vscale x 8 x i16> @uqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: uqsub_i16
-; CHECK: uqsub z0.h, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqsub_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqsub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 16 x i8> @uqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: uqsub_i8
-; CHECK: uqsub z0.b, z0.b, z1.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: uqsub_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uqsub z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %res
 }
 
+define <vscale x 16 x i8> @mla_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
+; CHECK-LABEL: mla_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mla z2.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %prod = mul <vscale x 16 x i8> %a, %b
+  %res = add <vscale x 16 x i8> %c, %prod
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @mla_i8_multiuse(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8>* %p) {
+; CHECK-LABEL: mla_i8_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
+; CHECK-NEXT:    add z0.b, z2.b, z1.b
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  %prod = mul <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %prod, <vscale x 16 x i8>* %p
+  %res = add <vscale x 16 x i8> %c, %prod
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @mls_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
+; CHECK-LABEL: mls_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mls z2.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %prod = mul <vscale x 16 x i8> %a, %b
+  %res = sub <vscale x 16 x i8> %c, %prod
+  ret <vscale x 16 x i8> %res
+}
+
 declare <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)