Index: include/llvm/IR/Operator.h
===================================================================
--- include/llvm/IR/Operator.h
+++ include/llvm/IR/Operator.h
@@ -202,7 +202,6 @@
     setNoNaNs();
     setNoInfs();
     setNoSignedZeros();
-    setAllowReciprocal();
   }
 
   void operator&=(const FastMathFlags &OtherFlags) {
@@ -227,7 +226,6 @@
       setHasNoNaNs(true);
       setHasNoInfs(true);
       setHasNoSignedZeros(true);
-      setHasAllowReciprocal(true);
     }
   }
   void setHasNoNaNs(bool B) {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8887,9 +8887,8 @@
 // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
 // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
-  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
   const SDNodeFlags *Flags = N->getFlags();
-  if (!UnsafeMath && !Flags->hasAllowReciprocal())
+  if (!Flags->hasAllowReciprocal())
     return SDValue();
 
   // Skip if current node is a reciprocal.
@@ -8912,7 +8911,7 @@
     if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
-      // This division is eligible for optimization only if global unsafe math
-      // is enabled or if this division allows reciprocal formation.
-      if (UnsafeMath || U->getFlags()->hasAllowReciprocal())
+      // This division is eligible for optimization only if it allows
+      // reciprocal formation.
+      if (U->getFlags()->hasAllowReciprocal())
         Users.insert(U);
     }
   }
Index: test/CodeGen/AArch64/fdiv-combine.ll
===================================================================
--- test/CodeGen/AArch64/fdiv-combine.ll
+++ test/CodeGen/AArch64/fdiv-combine.ll
@@ -11,9 +11,9 @@
 ; CHECK: fmul
 ; CHECK: fmul
 ; CHECK: fmul
-  %div = fdiv float %a, %D
-  %div1 = fdiv float %b, %D
-  %div2 = fdiv float %c, %D
+  %div = fdiv arcp float %a, %D
+  %div1 = fdiv arcp float %b, %D
+  %div2 = fdiv arcp float %c, %D
   tail call void @foo_3f(float %div, float %div1, float %div2)
   ret void
 }
@@ -25,9 +25,9 @@
 ; CHECK: fmul
 ; CHECK: fmul
 ; CHECK: fmul
-  %div = fdiv double %a, %D
-  %div1 = fdiv double %b, %D
-  %div2 = fdiv double %c, %D
+  %div = fdiv arcp double %a, %D
+  %div1 = fdiv arcp double %b, %D
+  %div2 = fdiv arcp double %c, %D
   tail call void @foo_3d(double %div, double %div1, double %div2)
   ret void
 }
@@ -39,9 +39,9 @@
 ; CHECK: fmul
 ; CHECK: fmul
 ; CHECK: fmul
-  %div = fdiv <4 x float> %a, %D
-  %div1 = fdiv <4 x float> %b, %D
-  %div2 = fdiv <4 x float> %c, %D
+  %div = fdiv arcp <4 x float> %a, %D
+  %div1 = fdiv arcp <4 x float> %b, %D
+  %div2 = fdiv arcp <4 x float> %c, %D
   tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
   ret void
 }
@@ -53,9 +53,9 @@
 ; CHECK: fmul
 ; CHECK: fmul
 ; CHECK: fmul
-  %div = fdiv <2 x double> %a, %D
-  %div1 = fdiv <2 x double> %b, %D
-  %div2 = fdiv <2 x double> %c, %D
+  %div = fdiv arcp <2 x double> %a, %D
+  %div1 = fdiv arcp <2 x double> %b, %D
+  %div2 = fdiv arcp <2 x double> %c, %D
   tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2)
   ret void
 }
Index: test/CodeGen/AMDGPU/fdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv.ll
+++ test/CodeGen/AMDGPU/fdiv.ll
@@ -62,7 +62,7 @@
 ; SI: buffer_store_dword [[RESULT]]
 define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
-  %fdiv = fdiv fast float %a, %b
+  %fdiv = fdiv fast arcp float %a, %b
   store float %fdiv, float addrspace(1)* %out
   ret void
 }
@@ -77,7 +77,7 @@
 ; SI: buffer_store_dword [[RESULT]]
 define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
-  %fdiv = fdiv fast float %a, %b
+  %fdiv = fdiv fast arcp float %a, %b
   store float %fdiv, float addrspace(1)* %out
   ret void
 }
Index: test/CodeGen/PowerPC/fdiv-combine.ll
===================================================================
--- test/CodeGen/PowerPC/fdiv-combine.ll
+++ test/CodeGen/PowerPC/fdiv-combine.ll
@@ -14,9 +14,9 @@
 ; CHECK: fmul
 ; CHECK: fmul
 ; CHECK: fmul
-  %div = fdiv double %a, %D
-  %div1 = fdiv double %b, %D
-  %div2 = fdiv double %c, %D
+  %div = fdiv arcp double %a, %D
+  %div1 = fdiv arcp double %b, %D
+  %div2 = fdiv arcp double %c, %D
   tail call void @foo_3d(double %div, double %div1, double %div2)
   ret void
 }
Index: test/CodeGen/X86/fdiv-combine.ll
===================================================================
--- test/CodeGen/X86/fdiv-combine.ll
+++ test/CodeGen/X86/fdiv-combine.ll
@@ -89,8 +89,8 @@
 ; CHECK-NEXT: mulsd %xmm2, %xmm0
 ; CHECK-NEXT: addsd %xmm2, %xmm0
 ; CHECK-NEXT: retq
-  %div1 = fdiv fast double 1.0, %y
-  %div2 = fdiv fast double %x, %y
+  %div1 = fdiv arcp double 1.0, %y
+  %div2 = fdiv arcp double %x, %y
   %ret = fadd fast double %div2, %div1
   ret double %ret
 }
Index: test/LTO/X86/Inputs/fast-with-recip.ll
===================================================================
--- /dev/null
+++ test/LTO/X86/Inputs/fast-with-recip.ll
@@ -0,0 +1,9 @@
+define void @fastWithRecip(float %a, float %b, float %c) {
+entry:
+  %div = fdiv fast arcp float %a, %c
+  %div1 = fdiv fast arcp float %b, %c
+  tail call void @useWithRecip(float %div, float %div1)
+  ret void
+}
+
+declare void @useWithRecip(float, float)
Index: test/LTO/X86/Inputs/fast-without-recip.ll
===================================================================
--- /dev/null
+++ test/LTO/X86/Inputs/fast-without-recip.ll
@@ -0,0 +1,9 @@
+define void @fastWithoutRecip(float %a, float %b, float %c) {
+entry:
+  %div = fdiv fast float %a, %c
+  %div1 = fdiv fast float %b, %c
+  tail call void @useWithoutRecip(float %div, float %div1)
+  ret void
+}
+
+declare void @useWithoutRecip(float, float)
Index: test/LTO/X86/fast-recip.ll
===================================================================
--- /dev/null
+++ test/LTO/X86/fast-recip.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-link -o %t.bc %s %p/Inputs/fast-without-recip.ll %p/Inputs/fast-with-recip.ll
+; RUN: opt -inline -instcombine -o %t2.bc %t.bc
+; RUN: llc -disable-tail-calls %t2.bc -o - | FileCheck %s
+
+; Inlining will be done on fastWithRecip() (built with fast-math and the
+; reciprocal transformation enabled) and fastWithoutRecip() (built with
+; fast-math but with the reciprocal transformation disabled). Both contain
+; two divisions with the same denominator, so both are candidates for the
+; reciprocal transformation. We verify that the enabled version emits only
+; one division (the reciprocal) followed by two multiplications, and that
+; the disabled version emits both divisions and no multiplications.
+
+define void @foo(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5) #0 {
+entry:
+; CHECK: fooEnter
+; CHECK: div
+; CHECK-NOT: div
+; CHECK: mul
+; CHECK: mul
+; CHECK: useWithRecip
+; CHECK: div
+; CHECK: div
+; CHECK-NOT: mul
+; CHECK: useWithoutRecip
+; CHECK: fooExit
+  tail call void @fooEnter()
+  tail call void @fastWithRecip(float %a0, float %a1, float %a2)
+  tail call void @fastWithoutRecip(float %a3, float %a4, float %a5)
+  tail call void @fooExit()
+  ret void
+}
+
+declare void @fooEnter()
+declare void @fastWithRecip(float, float, float)
+declare void @fastWithoutRecip(float, float, float)
+declare void @fooExit()
+
+attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" }
Index: test/Transforms/InstCombine/fast-math.ll
===================================================================
--- test/Transforms/InstCombine/fast-math.ll
+++ test/Transforms/InstCombine/fast-math.ll
@@ -347,7 +347,7 @@
 ; X/C1 / C2 => X * (1/(C2*C1))
 define float @fdiv1(float %x) {
   %div = fdiv float %x, 0x3FF3333340000000
-  %div1 = fdiv fast float %div, 0x4002666660000000
+  %div1 = fdiv fast arcp float %div, 0x4002666660000000
   ret float %div1
 ; 0x3FF3333340000000 = 1.2f
 ; 0x4002666660000000 = 2.3f
Index: test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
+++ test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@@ -254,9 +254,9 @@
   %load3 = load float, float* %idx3, align 4
   %load4 = load float, float* %idx4, align 4
 
-  %op1 = fadd fast float %load1, 1.0
-  %op2 = fadd fast float %load2, 1.0
-  %op3 = fadd fast float %load3, 1.0
+  %op1 = fadd fast arcp float %load1, 1.0
+  %op2 = fadd fast arcp float %load2, 1.0
+  %op3 = fadd fast arcp float %load3, 1.0
   %op4 = fadd arcp float %load4, 1.0
 
   store float %op1, float* %idx1, align 4
Index: unittests/IR/IRBuilderTest.cpp
===================================================================
--- unittests/IR/IRBuilderTest.cpp
+++ unittests/IR/IRBuilderTest.cpp
@@ -163,6 +163,8 @@
   FAdd = cast<FPMathOperator>(F);
   EXPECT_TRUE(FAdd->hasNoNaNs());
 
+  FMF.setAllowReciprocal();
+  Builder.setFastMathFlags(FMF);
   F = Builder.CreateFDiv(F, F);
   EXPECT_TRUE(Builder.getFastMathFlags().any());
   EXPECT_TRUE(Builder.getFastMathFlags().UnsafeAlgebra);
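For reference, here is a minimal sketch (not part of the patch) of the repeated-divisor transformation that combineRepeatedFPDivisors enables when the arcp flag is present. It is written as LLVM IR for readability even though the combine itself runs on the SelectionDAG, and the names @repeated_divisors, @use_3f, %q0..%q2 and %recip are illustrative only.

; Before: three divisions share the denominator %d and each carries arcp.
define void @repeated_divisors(float %a, float %b, float %c, float %d) {
entry:
  %q0 = fdiv arcp float %a, %d
  %q1 = fdiv arcp float %b, %d
  %q2 = fdiv arcp float %c, %d
  tail call void @use_3f(float %q0, float %q1, float %q2)
  ret void
}

declare void @use_3f(float, float, float)

; After the combine, codegen conceptually emits one reciprocal and three
; multiplications instead of three divisions:
;   %recip = fdiv arcp float 1.000000e+00, %d
;   %q0    = fmul arcp float %a, %recip
;   %q1    = fmul arcp float %b, %recip
;   %q2    = fmul arcp float %c, %recip

With this patch the combine fires only when each division carries arcp; the global UnsafeFPMath target option alone no longer enables it.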