diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -77,6 +77,7 @@
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
   static bool optimizePTest(IntrinsicInst *I);
+  static bool optimizeVectorMul(IntrinsicInst *I);
   static bool processPhiNode(IntrinsicInst *I);
 };
@@ -366,6 +367,76 @@
   return false;
 }
 
+bool SVEIntrinsicOpts::optimizeVectorMul(IntrinsicInst *I) {
+  assert((I->getIntrinsicID() == Intrinsic::aarch64_sve_mul ||
+          I->getIntrinsicID() == Intrinsic::aarch64_sve_fmul) &&
+         "Unexpected opcode");
+
+  auto *OpPredicate = I->getOperand(0);
+  auto *OpMultiplicand = I->getOperand(1);
+  auto *OpMultiplier = I->getOperand(2);
+
+  // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
+  // with a unit splat value, false otherwise.
+  auto IsUnitDupX = [](auto *I) {
+    auto *IntrI = dyn_cast<IntrinsicInst>(I);
+    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+      return false;
+
+    auto *SplatValue = IntrI->getOperand(0);
+    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+  };
+
+  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
+  // with a unit splat value, false otherwise.
+  auto IsUnitDup = [](auto *I) {
+    auto *IntrI = dyn_cast<IntrinsicInst>(I);
+    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+      return false;
+
+    auto *SplatValue = IntrI->getOperand(2);
+    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+  };
+
+  bool Changed = false;
+
+  // The OpMultiplier variable should always point to the dup (if any), so
+  // swap if necessary.
+  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+    std::swap(OpMultiplier, OpMultiplicand);
+
+  if (IsUnitDupX(OpMultiplier)) {
+    // [f]mul pg (dupx 1) %n => %n
+    I->replaceAllUsesWith(OpMultiplicand);
+    I->eraseFromParent();
+    Changed = true;
+  } else if (IsUnitDup(OpMultiplier)) {
+    // [f]mul pg (dup pg 1) %n => %n
+    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
+    auto *DupPg = DupInst->getOperand(1);
+    // TODO: this is naive. The optimization is still valid if DupPg
+    // 'encompasses' OpPredicate, not only if they're the same predicate.
+    if (OpPredicate == DupPg) {
+      I->replaceAllUsesWith(OpMultiplicand);
+      I->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  // If an instruction was optimized out then it is possible that some dangling
+  // instructions are left.
+  if (Changed) {
+    auto *OpPredicateInst = dyn_cast<Instruction>(OpPredicate);
+    auto *OpMultiplierInst = dyn_cast<Instruction>(OpMultiplier);
+    if (OpMultiplierInst && OpMultiplierInst->use_empty())
+      OpMultiplierInst->eraseFromParent();
+    if (OpPredicateInst && OpPredicateInst->use_empty())
+      OpPredicateInst->eraseFromParent();
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
   assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
          "Unexpected opcode");
@@ -429,6 +500,9 @@
   switch (IntrI->getIntrinsicID()) {
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_mul:
+    return optimizeVectorMul(IntrI);
   case Intrinsic::aarch64_sve_ptest_any:
   case Intrinsic::aarch64_sve_ptest_first:
   case Intrinsic::aarch64_sve_ptest_last:
@@ -484,6 +558,8 @@
     case Intrinsic::aarch64_sve_ptest_first:
     case Intrinsic::aarch64_sve_ptest_last:
     case Intrinsic::aarch64_sve_ptrue:
+    case Intrinsic::aarch64_sve_mul:
+    case Intrinsic::aarch64_sve_fmul:
      for (User *U : F.users())
        Functions.insert(cast<Instruction>(U)->getFunction());
      break;
diff --git a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll b/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Idempotent fmuls -- should compile to just a ret.
+define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @idempotent_fmul_f16(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: @idempotent_fmul_f32(
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A:%.*]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @idempotent_fmul_f64(
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @idempotent_fmul_different_argument_order(
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %1, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @idempotent_fmul_with_predicated_dup(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+  ; Edge case -- make sure that the case where we're fmultiplying two dups
+  ; together is sane.
+; CHECK-LABEL: @idempotent_fmul_two_dups(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH3C00)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %3 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+  ret <vscale x 8 x half> %3
+}
+
+; Non-idempotent fmuls -- we don't expect these to be optimised out.
+define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH4000)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) {
+  ; Different predicates
+; CHECK-LABEL: @non_idempotent_fmul_with_predicated_dup(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg2, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll b/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Idempotent muls -- should compile to just a ret.
+define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @idempotent_mul_i16(
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: @idempotent_mul_i32(
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @idempotent_mul_i64(
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @idempotent_mul_different_argument_order(
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @idempotent_mul_with_predicated_dup(
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+  ; Edge case -- make sure that the case where we're multiplying two dups
+  ; together is sane.
+; CHECK-LABEL: @idempotent_mul_two_dups(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+  ret <vscale x 8 x i16> %3
+}
+
+; Non-idempotent muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) {
+  ; Different predicates
+; CHECK-LABEL: @non_idempotent_mul_with_predicated_dup(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg1, i64 1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
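
Illustrative sketch (not one of the patch hunks above, so it should not be applied): a minimal before/after of the rewrite optimizeVectorMul performs, assuming the pass is run as in the tests' RUN line, `opt -S -aarch64-sve-intrinsic-opts`. The function name `@example` and the value names are hypothetical.

; Before the pass: %a is multiplied by a splat of 1 under predicate %pg.
define <vscale x 4 x i32> @example(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
  %one = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %one)
  ret <vscale x 4 x i32> %res
}

; Expected result after the pass: the mul is replaced by its multiplicand and
; the now-unused dup is erased as a dangling instruction.
define <vscale x 4 x i32> @example(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
  ret <vscale x 4 x i32> %a
}

declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)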