diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -163,6 +163,18 @@
   return IC.replaceInstUsesWith(II, NewCall);
 }
 
+static bool isConstantFPZero(Value *V) {
+  if (auto *C = dyn_cast<ConstantFP>(V))
+    return C->isZero();
+  return false;
+}
+
+static bool isConstantFPFiniteNonZero(Value *V) {
+  if (auto *C = dyn_cast<ConstantFP>(V))
+    return !C->isZero() && !C->isInfinity() && !C->isNaN();
+  return false;
+}
+
 Optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -823,6 +835,36 @@
     break;
   }
+  case Intrinsic::amdgcn_fmul_legacy: {
+    Value *Op0 = II.getArgOperand(0);
+    Value *Op1 = II.getArgOperand(1);
+
+    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN
+    // or infinity, gives +0.0.
+    // TODO: Move to InstSimplify?
+    if (isConstantFPZero(Op0) || isConstantFPZero(Op1))
+      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
+
+    // If we can prove we don't have one of the special cases then we can use a
+    // normal fmul instruction instead.
+    auto *TLI = &IC.getTargetLibraryInfo();
+    bool CanSimplifyToMul = false;
+    if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
+        isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+      // Neither operand is infinity or NaN.
+      CanSimplifyToMul = true;
+    } else if (isConstantFPFiniteNonZero(Op0) ||
+               isConstantFPFiniteNonZero(Op1)) {
+      // One operand is not zero or infinity or NaN.
+      CanSimplifyToMul = true;
+    }
+    if (CanSimplifyToMul) {
+      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
+      FMul->takeName(&II);
+      return IC.replaceInstUsesWith(II, FMul);
+    }
+    break;
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s
+
+; Simplify to +0.0.
+define float @test_zero(float %x) {
+; CHECK-LABEL: @test_zero(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %call = call float @llvm.amdgcn.fmul.legacy(float %x, float 0.0)
+  ret float %call
+}
+
+; Simplify to +0.0.
+define float @test_negzero(float %y) {
+; CHECK-LABEL: @test_negzero(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %call = call float @llvm.amdgcn.fmul.legacy(float -0.0, float %y)
+  ret float %call
+}
+
+; Combine to fmul because the constant is finite and non-zero.
+define float @test_const(float %x) {
+; CHECK-LABEL: @test_const(
+; CHECK-NEXT:    [[CALL:%.*]] = fmul float [[X:%.*]], 9.950000e+01
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.amdgcn.fmul.legacy(float %x, float 99.5)
+  ret float %call
+}
+
+; Combine to fmul because neither argument can be infinity or NaN.
+define float @test_finite(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_finite(
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i32 [[X:%.*]] to float
+; CHECK-NEXT:    [[YF:%.*]] = sitofp i32 [[Y:%.*]] to float
+; CHECK-NEXT:    [[CALL:%.*]] = fmul float [[XF]], [[YF]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %xf = sitofp i32 %x to float
+  %yf = sitofp i32 %y to float
+  %call = call float @llvm.amdgcn.fmul.legacy(float %xf, float %yf)
+  ret float %call
+}
+
+declare float @llvm.amdgcn.fmul.legacy(float, float)
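
For completeness, a negative test would pin down the case the guards exist for: when neither operand is a constant and neither can be proven finite, both the zero fold and the fmul combine must stay out of the way, and the intrinsic call survives unchanged. The sketch below is hypothetical and not part of the patch above; the function name @test_unknown and its CHECK lines are illustrative, not autogenerated output.

; Sketch of a possible negative test: either operand could be +/-0.0,
; infinity or NaN, so no simplification is legal and the call must remain.
define float @test_unknown(float %x, float %y) {
; CHECK-LABEL: @test_unknown(
; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.amdgcn.fmul.legacy(float [[X:%.*]], float [[Y:%.*]])
; CHECK-NEXT:    ret float [[CALL]]
;
  %call = call float @llvm.amdgcn.fmul.legacy(float %x, float %y)
  ret float %call
}

; Declaration repeated so the snippet stands alone.
declare float @llvm.amdgcn.fmul.legacy(float, float)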