diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -163,6 +163,27 @@
   return IC.replaceInstUsesWith(II, NewCall);
 }
 
+bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+                                           InstCombiner &IC) const {
+  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+  // infinity, gives +0.0. If we can prove we don't have one of the special
+  // cases then we can use a normal multiply instead.
+  // TODO: Create and use isKnownFiniteNonZero instead of just matching
+  // constants here.
+  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
+      match(Op1, PatternMatch::m_FiniteNonZero())) {
+    // One operand is not zero or infinity or NaN.
+    return true;
+  }
+  auto *TLI = &IC.getTargetLibraryInfo();
+  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
+      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+    // Neither operand is infinity or NaN.
+    return true;
+  }
+  return false;
+}
+
 Optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -836,26 +857,40 @@
 
     // If we can prove we don't have one of the special cases then we can use a
     // normal fmul instruction instead.
-    auto *TLI = &IC.getTargetLibraryInfo();
-    bool CanSimplifyToMul = false;
-    // TODO: Create and use isKnownFiniteNonZero instead of just matching
-    // constants here.
-    if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
-        match(Op1, PatternMatch::m_FiniteNonZero())) {
-      // One operand is not zero or infinity or NaN.
-      CanSimplifyToMul = true;
-    } else if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
-               isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
-      // Neither operand is infinity or NaN.
-      CanSimplifyToMul = true;
-    }
-    if (CanSimplifyToMul) {
+    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
       FMul->takeName(&II);
       return IC.replaceInstUsesWith(II, FMul);
     }
     break;
   }
+  case Intrinsic::amdgcn_fma_legacy: {
+    Value *Op0 = II.getArgOperand(0);
+    Value *Op1 = II.getArgOperand(1);
+    Value *Op2 = II.getArgOperand(2);
+
+    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+    // infinity, gives +0.0.
+    // TODO: Move to InstSimplify?
+    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
+        match(Op1, PatternMatch::m_AnyZeroFP())) {
+      // It's tempting to just return Op2 here, but that would give the wrong
+      // result if Op2 was -0.0.
+      auto *Zero = ConstantFP::getNullValue(II.getType());
+      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
+      FAdd->takeName(&II);
+      return IC.replaceInstUsesWith(II, FAdd);
+    }
+
+    // If we can prove we don't have one of the special cases then we can use a
+    // normal fma instead.
+    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+      II.setCalledOperand(Intrinsic::getDeclaration(
+          II.getModule(), Intrinsic::fma, II.getType()));
+      return &II;
+    }
+    break;
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -227,6 +227,8 @@
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const;
 
+  bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+                                 InstCombiner &IC) const;
   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                IntrinsicInst &II) const;
   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s
+
+; Simplify to +0.0 + z.
+define float @test_zero(float %x, float %z) {
+; CHECK-LABEL: @test_zero(
+; CHECK-NEXT:    [[CALL:%.*]] = fadd float [[Z:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.amdgcn.fma.legacy(float %x, float 0.0, float %z)
+  ret float %call
+}
+
+; Simplify to +0.0 + z, preserving fmf.
+define float @test_zero_fmf(float %x, float %z) {
+; CHECK-LABEL: @test_zero_fmf(
+; CHECK-NEXT:    [[CALL:%.*]] = fadd contract float [[Z:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call contract float @llvm.amdgcn.fma.legacy(float %x, float 0.0, float %z)
+  ret float %call
+}
+
+; Simplify to z.
+define float @test_zero_nsz(float %x, float %z) {
+; CHECK-LABEL: @test_zero_nsz(
+; CHECK-NEXT:    ret float [[Z:%.*]]
+;
+  %call = call nsz float @llvm.amdgcn.fma.legacy(float %x, float 0.0, float %z)
+  ret float %call
+}
+
+; Simplify to +0.0 + z.
+define float @test_negzero(float %y, float %z) {
+; CHECK-LABEL: @test_negzero(
+; CHECK-NEXT:    [[CALL:%.*]] = fadd float [[Z:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.amdgcn.fma.legacy(float -0.0, float %y, float %z)
+  ret float %call
+}
+
+; Simplify to z.
+define float @test_negzero_nsz(float %y, float %z) {
+; CHECK-LABEL: @test_negzero_nsz(
+; CHECK-NEXT:    ret float [[Z:%.*]]
+;
+  %call = call nsz float @llvm.amdgcn.fma.legacy(float -0.0, float %y, float %z)
+  ret float %call
+}
+
+; Combine to fma because the constant is finite and non-zero.
+define float @test_const(float %x, float %z) {
+; CHECK-LABEL: @test_const(
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.fma.f32(float [[X:%.*]], float 9.950000e+01, float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.amdgcn.fma.legacy(float %x, float 99.5, float %z)
+  ret float %call
+}
+
+; Combine to fma because the constant is finite and non-zero, preserving fmf.
+define float @test_const_fmf(float %x, float %z) {
+; CHECK-LABEL: @test_const_fmf(
+; CHECK-NEXT:    [[CALL:%.*]] = call contract float @llvm.fma.f32(float [[X:%.*]], float 9.950000e+01, float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call contract float @llvm.amdgcn.fma.legacy(float %x, float 99.5, float %z)
+  ret float %call
+}
+
+; Combine to fma because neither argument can be infinity or NaN.
+define float @test_finite(i32 %x, i32 %y, float %z) {
+; CHECK-LABEL: @test_finite(
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i32 [[X:%.*]] to float
+; CHECK-NEXT:    [[YF:%.*]] = sitofp i32 [[Y:%.*]] to float
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.fma.f32(float [[XF]], float [[YF]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %xf = sitofp i32 %x to float
+  %yf = sitofp i32 %y to float
+  %call = call float @llvm.amdgcn.fma.legacy(float %xf, float %yf, float %z)
+  ret float %call
+}
+
+declare float @llvm.amdgcn.fma.legacy(float, float, float)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
@@ -29,6 +29,16 @@
   ret float %call
 }
 
+; Combine to fmul because the constant is finite and non-zero, preserving fmf.
+define float @test_const_fmf(float %x) {
+; CHECK-LABEL: @test_const_fmf(
+; CHECK-NEXT:    [[CALL:%.*]] = fmul contract float [[X:%.*]], 9.950000e+01
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call contract float @llvm.amdgcn.fmul.legacy(float %x, float 99.5)
+  ret float %call
+}
+
; Combine to fmul because neither argument can be infinity or NaN.
 define float @test_finite(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test_finite(
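
Side note (illustrative IR only, not part of the patch; the function names here are made up): the special cases that canSimplifyLegacyMulToMul has to rule out are exactly the inputs on which IEEE and legacy multiplication disagree.

  ; IEEE fmul: 0.0 * +inf is NaN.
  define float @ieee_mul() {
    %r = fmul float 0.000000e+00, 0x7FF0000000000000
    ret float %r
  }

  ; Legacy multiply: +/-0.0 * anything, even +inf or NaN, is +0.0.
  define float @legacy_mul() {
    %r = call float @llvm.amdgcn.fmul.legacy(float 0.000000e+00, float 0x7FF0000000000000)
    ret float %r
  }

  declare float @llvm.amdgcn.fmul.legacy(float, float)

Whenever one operand is known finite and non-zero, or neither operand can be infinity or NaN, the two multiplies agree on every input, which is what justifies rewriting the legacy intrinsics to fmul/fma above.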