diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1243,10 +1243,13 @@
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return Instruction::BinaryOps::FMul;
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
     return Instruction::BinaryOps::FAdd;
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
     return Instruction::BinaryOps::FSub;
   default:
     return Instruction::BinaryOpsEnd;
@@ -1292,6 +1295,11 @@
                                                    Intrinsic::aarch64_sve_mad>(
           IC, II, false))
     return MAD;
+  if (auto FMLA_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmla_u>(
+              IC, II, true))
+    return FMLA_U;
   return instCombineSVEVectorBinOp(IC, II);
 }
 
@@ -1311,6 +1319,11 @@
                                             Intrinsic::aarch64_sve_fnmsb>(
               IC, II, false))
     return FMSB;
+  if (auto FMLS_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmls_u>(
+              IC, II, true))
+    return FMLS_U;
   return instCombineSVEVectorBinOp(IC, II);
 }
 
@@ -1684,25 +1697,20 @@
     return instCombineSVEPTest(IC, II);
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
   case Intrinsic::aarch64_sve_add:
     return instCombineSVEVectorAdd(IC, II);
-  case Intrinsic::aarch64_sve_fadd_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmla_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_add_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
-  case Intrinsic::aarch64_sve_fsub_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmls_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_sub_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mls_u>(
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
@@ -6,7 +6,7 @@
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
 
-; SVE intrinsics fmul and fadd should be replaced with regular fmul and fadd
+; SVE intrinsics fmul, fmul_u, fadd, fadd_u, fsub and fsub_u should be replaced with regular fmul, fadd and fsub.
 declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fmul_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fmul_intrinsic_half
@@ -37,6 +37,36 @@
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fmul_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_half
+; CHECK-NEXT: %1 = fmul fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fmul_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_float
+; CHECK-NEXT: %1 = fmul fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fmul_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_double
+; CHECK-NEXT: %1 = fmul fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fadd_intrinsic_half
@@ -67,6 +97,36 @@
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_half
+; CHECK-NEXT: %1 = fadd fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_float
+; CHECK-NEXT: %1 = fadd fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_double
+; CHECK-NEXT: %1 = fadd fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fsub_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fsub_intrinsic_half
@@ -87,7 +147,6 @@
   ret <vscale x 4 x float> %2
 }
 
-
 declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 define <vscale x 2 x double> @replace_fsub_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: @replace_fsub_intrinsic_double
@@ -117,4 +176,44 @@
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fsub_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_half
+; CHECK-NEXT: %1 = fsub fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fsub_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_float
+; CHECK-NEXT: %1 = fsub fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fsub_u_intrinsic_no_fast_flag(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_no_fast_flag
+; CHECK-NEXT: %1 = fsub <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @no_replace_on_non_ptrue_all_u(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @no_replace_on_non_ptrue_all_u
+; CHECK-NEXT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+; CHECK-NEXT: ret <vscale x 2 x double> %2
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -0,0 +1,119 @@
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Idempotent fmuls_u -- should compile to just a ret.
+define <vscale x 8 x half> @idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f16(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f32(
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A:%.*]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f64(
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_different_argument_order(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %1, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+  ; Edge case -- make sure that the case where we're fmultiplying two dups
+  ; together is sane.
+; CHECK-LABEL: @idempotent_fmul_u_two_dups(
+; CHECK-NEXT:    ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %3 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+  ret <vscale x 8 x half> %3
+}
+
+; Non-idempotent fmuls_u -- we don't expect these to be optimised out.
+define <vscale x 8 x half> @non_idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @non_idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
+  ; Different predicates
+; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg2, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+attributes #0 = { "target-features"="+sve" }