diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -255,7 +255,17 @@
 
 def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">,
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
-  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
+>;
+
+// Fused single-precision multiply-add with legacy behaviour for the multiply,
+// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
+// intended for use on subtargets that have the v_fma_legacy_f32 and/or
+// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
+// has a completely different kind of legacy behaviour.)
+def int_amdgcn_fma_legacy :
+  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
 >;
 
 def int_amdgcn_rcp : Intrinsic<
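For reference, a minimal sketch of how the new intrinsic is called from IR (illustrative only, not part of the patch; the function name is invented):

  declare float @llvm.amdgcn.fma.legacy(float, float, float)

  define float @mul_add_legacy(float %x, float %y, float %z) {
    ; Computes %x * %y + %z, except that a +/- 0.0 value of %x or %y forces the
    ; product to +0.0 even when the other multiplicand is NaN or infinity.
    %r = call float @llvm.amdgcn.fma.legacy(float %x, float %y, float %z)
    ret float %r
  }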
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1504,6 +1504,7 @@
   case Intrinsic::amdgcn_cubesc:
   case Intrinsic::amdgcn_cubetc:
   case Intrinsic::amdgcn_fmul_legacy:
+  case Intrinsic::amdgcn_fma_legacy:
   case Intrinsic::amdgcn_fract:
   case Intrinsic::amdgcn_ldexp:
   case Intrinsic::amdgcn_sin:
@@ -2371,8 +2372,8 @@
       if (IntrinsicID == Intrinsic::amdgcn_fmul_legacy) {
         const APFloat &C1 = Op1->getValueAPF();
         const APFloat &C2 = Op2->getValueAPF();
-        // The legacy behaviour is that multiplying zero by anything, even NaN
-        // or infinity, gives +0.0.
+        // The legacy behaviour is that multiplying +/- 0.0 by anything, even
+        // NaN or infinity, gives +0.0.
         if (C1.isZero() || C2.isZero())
           return ConstantFP::getNullValue(Ty);
         return ConstantFP::get(Ty->getContext(), C1 * C2);
@@ -2706,6 +2707,19 @@
       if (const auto *Op3 = dyn_cast<ConstantFP>(Operands[2])) {
         switch (IntrinsicID) {
         default: break;
+        case Intrinsic::amdgcn_fma_legacy: {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          // The legacy behaviour is that multiplying +/- 0.0 by anything, even
+          // NaN or infinity, gives +0.0.
+          if (C1.isZero() || C2.isZero()) {
+            const APFloat &C3 = Op3->getValueAPF();
+            // It's tempting to just return C3 here, but that would give the
+            // wrong result if C3 was -0.0.
+            return ConstantFP::get(Ty->getContext(), APFloat(0.0f) + C3);
+          }
+          LLVM_FALLTHROUGH;
+        }
        case Intrinsic::fma:
        case Intrinsic::fmuladd: {
          APFloat V = Op1->getValueAPF();
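To make the -0.0 remark above concrete, here is a sketch of the intended fold (illustrative only, not part of the patch; the function name is invented). A zero multiplicand makes the legacy product +0.0, so the folded result is +0.0 plus the addend; for an addend of -0.0 that sum is +0.0, whereas returning the addend unchanged would wrongly give -0.0:

  declare float @llvm.amdgcn.fma.legacy(float, float, float)

  define float @fold_zero_times_inf_plus_negzero() {
    ; Expected to constant-fold to +0.0, not to the -0.0 addend.
    %r = call float @llvm.amdgcn.fma.legacy(float +0.0, float 0x7ff0000000000000, float -0.0)
    ret float %r
  }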
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -599,6 +599,7 @@
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMED3:
+  // TODO: handle llvm.amdgcn.fma.legacy
     return true;
   default:
     return false;
@@ -3723,6 +3724,7 @@
   }
   case ISD::FMA:
   case ISD::FMAD: {
+    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();
 
@@ -4713,6 +4715,12 @@
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
+    case Intrinsic::amdgcn_fma_legacy:
+      if (SNaN)
+        return true;
+      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
+             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4020,6 +4020,7 @@
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
+    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -895,6 +895,17 @@
         SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+let SubtargetPredicate = HasNoMadMacF32Insts in
+def : GCNPat <
+  (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
+                              (VOP3NoMods f32:$src1),
+                              (VOP3NoMods f32:$src2))),
+  (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+                         SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
 let SubtargetPredicate = Has16BitInsts in {
 def : FMADPat ;
 def : FMADPat ;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -298,7 +298,9 @@
 } // End SubtargetPredicate = HasMadMacInsts
 
 let SubtargetPredicate = HasNoMadMacF32Insts in
-def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
+  VOP3_Profile<VOP_F32_F32_F32_F32>,
+  int_amdgcn_fma_legacy>;
 }
 
 def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s
+
+define float @v_fma(float %a, float %b, float %c) {
+; GCN-LABEL: v_fma:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    v_fmac_legacy_f32_e64 v2, v0, v1
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %c)
+  ret float %fma
+}
+
+define float @v_fabs_fma(float %a, float %b, float %c) {
+; GCN-LABEL: v_fabs_fma:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    v_fma_legacy_f32 v0, |v0|, v1, v2
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.a = call float @llvm.fabs.f32(float %a)
+  %fma = call float @llvm.amdgcn.fma.legacy(float %fabs.a, float %b, float %c)
+  ret float %fma
+}
+
+define float @v_fneg_fabs_fma(float %a, float %b, float %c) {
+; GCN-LABEL: v_fneg_fabs_fma:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    v_fma_legacy_f32 v0, v0, -|v1|, v2
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.b = call float @llvm.fabs.f32(float %b)
+  %neg.fabs.b = fneg float %fabs.b
+  %fma = call float @llvm.amdgcn.fma.legacy(float %a, float %neg.fabs.b, float %c)
+  ret float %fma
+}
+
+define float @v_fneg_fma(float %a, float %b, float %c) {
+; GCN-LABEL: v_fneg_fma:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    v_fma_legacy_f32 v0, v0, v1, -v2
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %neg.c = fneg float %c
+  %fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %neg.c)
+  ret float %fma
+}
+
+declare float @llvm.amdgcn.fma.legacy(float, float, float)
+declare float @llvm.fabs.f32(float)
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/AMDGPU/fma_legacy.ll b/llvm/test/Transforms/InstSimplify/ConstProp/AMDGPU/fma_legacy.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/AMDGPU/fma_legacy.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+declare float @llvm.amdgcn.fma.legacy(float, float, float)
+
+define void @test(float* %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    store volatile float 1.000000e+01, float* [[P:%.*]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 0.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 0.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 0.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 0.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    store volatile float 4.000000e+00, float* [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = call float @llvm.amdgcn.fma.legacy(float +2.0, float +3.0, float +4.0)
+  store volatile float %a, float* %p
+  %b = call float @llvm.amdgcn.fma.legacy(float +2.0, float +0.0, float +4.0)
+  store volatile float %b, float* %p
+  %c = call float @llvm.amdgcn.fma.legacy(float +2.0, float -0.0, float +4.0)
+  store volatile float %c, float* %p
+  %d = call float @llvm.amdgcn.fma.legacy(float +0.0, float +0.0, float -0.0)
+  store volatile float %d, float* %p
+  %e = call float @llvm.amdgcn.fma.legacy(float +0.0, float -0.0, float -0.0)
+  store volatile float %e, float* %p
+  %f = call float @llvm.amdgcn.fma.legacy(float -0.0, float +0.0, float -0.0)
+  store volatile float %f, float* %p
+  %g = call float @llvm.amdgcn.fma.legacy(float -0.0, float -0.0, float -0.0)
+  store volatile float %g, float* %p
+  %h = call float @llvm.amdgcn.fma.legacy(float +0.0, float 0x7ff0000000000000, float +4.0) ; +inf
+  store volatile float %h, float* %p
+  %i = call float @llvm.amdgcn.fma.legacy(float 0xfff0000000000000, float +0.0, float +4.0) ; -inf
+  store volatile float %i, float* %p
+  %j = call float @llvm.amdgcn.fma.legacy(float 0x7ff0001000000000, float -0.0, float +4.0) ; +nan
+  store volatile float %j, float* %p
+  %k = call float @llvm.amdgcn.fma.legacy(float -0.0, float 0xfff0000100000000, float +4.0) ; -nan
+  store volatile float %k, float* %p
+  ret void
+}