Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -682,6 +682,8 @@
   // Don't recurse exponentially.
   if (Depth > 6) return 0;
 
+  bool UnsafeFPMath = Options->UnsafeFPMath || Op->isFast();
+
   switch (Op.getOpcode()) {
   default: return false;
   case ISD::ConstantFP: {
@@ -695,7 +697,7 @@
   }
   case ISD::FADD:
     // FIXME: determine better conditions for this xform.
-    if (!Options->UnsafeFPMath) return 0;
+    if (!UnsafeFPMath) return 0;
 
     // After operation legalization, it might not be legal to create new FSUBs.
     if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
@@ -719,7 +721,7 @@
   case ISD::FMUL:
   case ISD::FDIV:
-    if (Options->HonorSignDependentRoundingFPMath()) return 0;
+    if (Options->HonorSignDependentRoundingFPMathOption && !UnsafeFPMath) return 0;
 
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
     if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
@@ -757,7 +759,7 @@
   }
   case ISD::FADD:
     // FIXME: determine better conditions for this xform.
-    assert(Options.UnsafeFPMath);
+    assert(Options.UnsafeFPMath || Op->isFast());
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
@@ -10190,7 +10192,7 @@
   }
 
   // FIXME: Auto-upgrade the target/function-level option.
-  if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
+  if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) {
     // fold (fadd A, 0) -> A
     if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
       if (N1C->isZero())
@@ -10198,7 +10200,7 @@
   }
 
   // If 'unsafe math' is enabled, fold lots of things.
-  if (Options.UnsafeFPMath) {
+  if (Options.UnsafeFPMath || Flags.isFast()) {
     // No FP constant should be created after legalization as Instruction
     // Selection pass has a hard time dealing with FP constants.
     bool AllowNewConst = (Level < AfterLegalizeDAG);
@@ -10333,7 +10335,7 @@
                        GetNegatedExpression(N1, DAG, LegalOperations), Flags);
 
   // FIXME: Auto-upgrade the target/function-level option.
-  if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
+  if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) {
     // (fsub 0, B) -> -B
     if (N0CFP && N0CFP->isZero()) {
       if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
@@ -10344,7 +10346,7 @@
   }
 
   // If 'unsafe math' is enabled, fold lots of things.
-  if (Options.UnsafeFPMath) {
+  if (Options.UnsafeFPMath || Flags.isFast()) {
     // (fsub A, 0) -> A
     if (N1CFP && N1CFP->isZero())
       return N0;
@@ -10409,7 +10411,7 @@
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
-  if (Options.UnsafeFPMath) {
+  if (Options.UnsafeFPMath || Flags.isFast()) {
     // fold (fmul A, 0) -> 0
     if (N1CFP && N1CFP->isZero())
       return N1;
@@ -10642,7 +10644,7 @@
 // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
 // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
-  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
+  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath || N->isFast();
   const SDNodeFlags Flags = N->getFlags();
   if (!UnsafeMath && !Flags.hasAllowReciprocal())
     return SDValue();
@@ -10720,7 +10722,7 @@
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
-  if (Options.UnsafeFPMath) {
+  if (Options.UnsafeFPMath || N->isFast()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
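    // Example: (fdiv X, 2.0) -> (fmul X, 0.5) is exact, since the reciprocal
    // of a power of two is representable; (fdiv X, 3.0) -> (fmul X, 0.333...)
    // rounds the reciprocal and can change the last bit of the result, which
    // is why the whole block is gated on unsafe math or the 'fast' flag.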
     if (N1CFP) {
       // Compute the reciprocal 1.0 / c2.
@@ -11134,7 +11136,7 @@
     // single-step fp_round we want to fold to.
     // In other words, double rounding isn't the same as rounding.
     // Also, this is a value preserving truncation iff both fp_round's are.
-    if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
+    if (DAG.getTarget().Options.UnsafeFPMath || N->isFast() || N0IsTrunc) {
       SDLoc DL(N);
       return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
                          DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
Index: test/CodeGen/AArch64/fdiv-combine_fmf.ll
===================================================================
--- test/CodeGen/AArch64/fdiv-combine_fmf.ll
+++ test/CodeGen/AArch64/fdiv-combine_fmf.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+
+; The following test cases check:
+;   a / D; b / D; c / D;
+;     =>
+;   recip = 1.0 / D; a * recip; b * recip; c * recip;
+define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
+; CHECK-LABEL: three_fdiv_float:
+; CHECK: fdiv s
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv fast float %a, %D
+  %div1 = fdiv fast float %b, %D
+  %div2 = fdiv fast float %c, %D
+  tail call void @foo_3f(float %div, float %div1, float %div2)
+  ret void
+}
+
+define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
+; CHECK-LABEL: three_fdiv_double:
+; CHECK: fdiv d
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv fast double %a, %D
+  %div1 = fdiv fast double %b, %D
+  %div2 = fdiv fast double %c, %D
+  tail call void @foo_3d(double %div, double %div1, double %div2)
+  ret void
+}
+
+define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: three_fdiv_4xfloat:
+; CHECK: fdiv v
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv fast <4 x float> %a, %D
+  %div1 = fdiv fast <4 x float> %b, %D
+  %div2 = fdiv fast <4 x float> %c, %D
+  tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
+  ret void
+}
+
+define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: three_fdiv_2xdouble:
+; CHECK: fdiv v
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv fast <2 x double> %a, %D
+  %div1 = fdiv fast <2 x double> %b, %D
+  %div2 = fdiv fast <2 x double> %c, %D
+  tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2)
+  ret void
+}
+
+; The following test cases check that we never combine two FDIVs when
+; neither of them calculates a reciprocal.
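+; The minimum number of uses of a common divisor that makes the rewrite
+; profitable is a target hook; with only two users, one fdiv plus two fmuls
+; is assumed to be no cheaper than two independent fdivs, so both divide
+; instructions below must survive.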
+define void @two_fdiv_float(float %D, float %a, float %b) #0 {
+; CHECK-LABEL: two_fdiv_float:
+; CHECK: fdiv s
+; CHECK: fdiv s
+; CHECK-NOT: fmul
+  %div = fdiv fast float %a, %D
+  %div1 = fdiv fast float %b, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv d
+; CHECK: fdiv d
+; CHECK-NOT: fmul
+  %div = fdiv fast double %a, %D
+  %div1 = fdiv fast double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+declare void @foo_3f(float, float, float)
+declare void @foo_3d(double, double, double)
+declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2f(float, float)
+declare void @foo_2d(double, double)
+
+; Assumed definition for the #0 group referenced above; the IR does not
+; parse without it, and the point of the test is that the per-instruction
+; 'fast' flags work with the global option off.
+attributes #0 = { "unsafe-fp-math"="false" }
Index: test/CodeGen/PowerPC/fdiv-combine_fmf.ll
===================================================================
--- test/CodeGen/PowerPC/fdiv-combine_fmf.ll
+++ test/CodeGen/PowerPC/fdiv-combine_fmf.ll
@@ -0,0 +1,38 @@
+; RUN: llc -verify-machineinstrs -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; The following test case checks:
+;   a / D; b / D; c / D;
+;     =>
+;   recip = 1.0 / D; a * recip; b * recip; c * recip;
+
+define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
+; CHECK-LABEL: three_fdiv_double:
+; CHECK: fdiv {{[0-9]}}
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv fast double %a, %D
+  %div1 = fdiv fast double %b, %D
+  %div2 = fdiv fast double %c, %D
+  tail call void @foo_3d(double %div, double %div1, double %div2)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv {{[0-9]}}
+; CHECK: fdiv {{[0-9]}}
+; CHECK-NOT: fmul
+  %div = fdiv fast double %a, %D
+  %div1 = fdiv fast double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+declare void @foo_3d(double, double, double)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2d(double, double)
+
+; Assumed definition for the #0 group referenced above; see the AArch64
+; variant of this test for the same rationale.
+attributes #0 = { "unsafe-fp-math"="false" }
Index: test/CodeGen/PowerPC/fmf-math.ll
===================================================================
--- test/CodeGen/PowerPC/fmf-math.ll
+++ test/CodeGen/PowerPC/fmf-math.ll
@@ -0,0 +1,8 @@
+; RUN: llc -verify-machineinstrs < %s -mattr=-vsx -mtriple=ppc32-- | grep fmul | count 1
+
+define double @foo(double %X) nounwind {
+  %tmp1 = fmul fast double %X, 1.23
+  %tmp2 = fmul fast double %tmp1, 4.124
+  ret double %tmp2
+}
+
Index: test/CodeGen/X86/change-ir-fp-math.ll
===================================================================
--- test/CodeGen/X86/change-ir-fp-math.ll
+++ test/CodeGen/X86/change-ir-fp-math.ll
@@ -0,0 +1,22 @@
+; Check that the per-instruction 'fast' IR flag enables/disables the fold.
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown \
+; RUN:   | FileCheck %s --check-prefix=CHECK
+
+; The fdiv should be converted to a mul when it carries the 'fast' flag;
+; without the flag it must remain a div.
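+; Note that 1/2.0 = 0.5 is exactly representable, yet the combiner still
+; requires the 'fast' flag (or the global unsafe-fp-math option) before
+; rewriting the divide, so the two functions below differ only in the flag.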
+
+; CHECK-LABEL: fast_fp_math:
+define double @fast_fp_math(double %x) {
+; CHECK: mulsd
+  %div = fdiv fast double %x, 2.0
+  ret double %div
+}
+
+; CHECK-LABEL: noflags_fp_math:
+define double @noflags_fp_math(double %x) {
+; CHECK: divsd
+  %div = fdiv double %x, 2.0
+  ret double %div
+}
Index: test/CodeGen/X86/fadd-combines_fmf.ll
===================================================================
--- test/CodeGen/X86/fadd-combines_fmf.ll
+++ test/CodeGen/X86/fadd-combines_fmf.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+define float @fadd_zero_f32(float %x) #0 {
+; CHECK-LABEL: fadd_zero_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, 0.0
+  ret float %y
+}
+
+define <4 x float> @fadd_zero_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_zero_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, zeroinitializer
+  ret <4 x float> %y
+}
+
+; CHECK: float 3
+define float @fadd_2const_f32(float %x) #0 {
+; CHECK-LABEL: fadd_2const_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, 1.0
+  %z = fadd fast float %y, 2.0
+  ret float %z
+}
+
+; CHECK: float 5
+; CHECK: float 5
+; CHECK: float 5
+; CHECK: float 5
+define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_2const_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd fast <4 x float> %y, <float 4.0, float 3.0, float 2.0, float 1.0>
+  ret <4 x float> %z
+}
+
+; CHECK: float 3
+define float @fadd_x_fmul_x_c_f32(float %x) #0 {
+; CHECK-LABEL: fadd_x_fmul_x_c_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fmul fast float %x, 2.0
+  %z = fadd fast float %x, %y
+  ret float %z
+}
+
+; CHECK: float 2
+; CHECK: float 3
+; CHECK: float 4
+; CHECK: float 5
+define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_x_fmul_x_c_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fmul fast <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd fast <4 x float> %x, %y
+  ret <4 x float> %z
+}
+
+; CHECK: float 3
+define float @fadd_fmul_x_c_x_f32(float %x) #0 {
+; CHECK-LABEL: fadd_fmul_x_c_x_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fmul fast float %x, 2.0
+  %z = fadd fast float %y, %x
+  ret float %z
+}
+
+; CHECK: float 2
+; CHECK: float 3
+; CHECK: float 4
+; CHECK: float 5
+define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_fmul_x_c_x_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fmul fast <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd fast <4 x float> %y, %x
+  ret <4 x float> %z
+}
+
+; CHECK: float 4
+define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_fmul_x_c_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, %x
+  %z = fmul fast float %x, 2.0
+  %w = fadd fast float %y, %z
+  ret float %w
+}
+
+; CHECK: float 3
+; CHECK: float 4
+; CHECK: float 5
+; CHECK: float 6
+define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_fmul_x_c_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, %x
+  %z = fmul fast <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %w = fadd fast <4 x float> %y, %z
+  ret <4 x float> %w
+}
+
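+; The folds above compose reassociation with constant folding, e.g.
+;   (x + x) + (x * c) -> (x * 2.0) + (x * c) -> x * (2.0 + c)
+; so a single mul against a constant-pool value is all that should remain.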
+; CHECK: float 4
+define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 {
+; CHECK-LABEL: fadd_fmul_x_c_fadd_x_x_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, %x
+  %z = fmul fast float %x, 2.0
+  %w = fadd fast float %z, %y
+  ret float %w
+}
+
+; CHECK: float 3
+; CHECK: float 4
+; CHECK: float 5
+; CHECK: float 6
+define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_fmul_x_c_fadd_x_x_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, %x
+  %z = fmul fast <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %w = fadd fast <4 x float> %z, %y
+  ret <4 x float> %w
+}
+
+; CHECK: float 3
+define float @fadd_x_fadd_x_x_f32(float %x) #0 {
+; CHECK-LABEL: fadd_x_fadd_x_x_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, %x
+  %z = fadd fast float %x, %y
+  ret float %z
+}
+
+; CHECK: float 3
+; CHECK: float 3
+; CHECK: float 3
+; CHECK: float 3
+define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_x_fadd_x_x_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, %x
+  %z = fadd fast <4 x float> %x, %y
+  ret <4 x float> %z
+}
+
+; CHECK: float 3
+define float @fadd_fadd_x_x_x_f32(float %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_x_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, %x
+  %z = fadd fast float %y, %x
+  ret float %z
+}
+
+; CHECK: float 3
+; CHECK: float 3
+; CHECK: float 3
+; CHECK: float 3
+define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_x_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, %x
+  %z = fadd fast <4 x float> %y, %x
+  ret <4 x float> %z
+}
+
+; CHECK: float 4
+define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_fadd_x_x_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast float %x, %x
+  %z = fadd fast float %y, %y
+  ret float %z
+}
+
+; CHECK: float 4
+; CHECK: float 4
+; CHECK: float 4
+; CHECK: float 4
+define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: fadd_fadd_x_x_fadd_x_x_4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %y = fadd fast <4 x float> %x, %x
+  %z = fadd fast <4 x float> %y, %y
+  ret <4 x float> %z
+}
+
+attributes #0 = { "less-precise-fpmad"="true" }
Index: test/CodeGen/X86/fdiv_fmf.ll
===================================================================
--- test/CodeGen/X86/fdiv_fmf.ll
+++ test/CodeGen/X86/fdiv_fmf.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define double @exact(double %x) {
+; Exact division by a constant converted to multiplication.
+; CHECK-LABEL: exact:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %div = fdiv fast double %x, 2.0
+  ret double %div
+}
+
+define double @inexact(double %x) {
+; Inexact division by a constant converted to multiplication.
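+; (0x41DFFFFFFFC00000 is 2147483647.0; its reciprocal is not exactly
+; representable, so the fold is only legal because of the 'fast' flag.)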
+; CHECK-LABEL: inexact:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %div = fdiv fast double %x, 0x41DFFFFFFFC00000
+  ret double %div
+}
+
+define double @funky(double %x) {
+; No conversion to multiplication if too funky.
+; CHECK-LABEL: funky:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %div = fdiv fast double %x, 0.0
+  ret double %div
+}
+
+define double @denormal1(double %x) {
+; Don't generate multiplication by a denormal.
+; CHECK-LABEL: denormal1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    divsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %div = fdiv fast double %x, 0x7FD0000000000001
+  ret double %div
+}
+
+define double @denormal2(double %x) {
+; Don't generate multiplication by a denormal.
+; CHECK-LABEL: denormal2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    divsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %div = fdiv fast double %x, 0x7FEFFFFFFFFFFFFF
+  ret double %div
+}
+
+; Deleting the negates does not require unsafe-fp-math.
+
+define float @double_negative(float %x, float %y) #0 {
+; CHECK-LABEL: double_negative:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %neg1 = fsub fast float -0.0, %x
+  %neg2 = fsub fast float -0.0, %y
+  %div = fdiv float %neg1, %neg2
+  ret float %div
+}
+
+; Assumed definition for the #0 group referenced above; without it the IR
+; does not parse.
+attributes #0 = { "unsafe-fp-math"="false" }
Index: test/CodeGen/X86/fmf-flags.ll
===================================================================
--- test/CodeGen/X86/fmf-flags.ll
+++ test/CodeGen/X86/fmf-flags.ll
@@ -7,9 +7,12 @@
 define float @fast_recip_sqrt(float %x) {
 ; X64-LABEL: fast_recip_sqrt:
 ; X64:       # %bb.0:
-; X64-NEXT:    sqrtss %xmm0, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    divss %xmm1, %xmm0
+; X64-NEXT:    rsqrtss %xmm0, %xmm1
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    addss {{.*}}(%rip), %xmm0
+; X64-NEXT:    mulss {{.*}}(%rip), %xmm1
+; X64-NEXT:    mulss %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: fast_recip_sqrt:
@@ -29,18 +32,13 @@
 define float @fast_fmuladd_opts(float %a , float %b , float %c) {
 ; X64-LABEL: fast_fmuladd_opts:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    addss %xmm0, %xmm1
-; X64-NEXT:    addss %xmm0, %xmm1
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: fast_fmuladd_opts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    fld %st(0)
-; X86-NEXT:    fadd %st(1)
-; X86-NEXT:    faddp %st(1)
+; X86-NEXT:    fmuls {{.*}}
 ; X86-NEXT:    retl
   %res = call fast float @llvm.fmuladd.f32(float %a, float 2.0, float %a)
   ret float %res
@@ -53,9 +51,9 @@
 define double @not_so_fast_mul_add(double %x) {
 ; X64-LABEL: not_so_fast_mul_add:
 ; X64:       # %bb.0:
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    movsd {{.*}}(%rip), %xmm1
 ; X64-NEXT:    mulsd %xmm0, %xmm1
-; X64-NEXT:    addsd %xmm1, %xmm0
+; X64-NEXT:    mulsd {{.*}}(%rip), %xmm0
 ; X64-NEXT:    movsd %xmm1, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -64,7 +62,9 @@
 ; X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fmull {{\.LCPI.*}}
-; X86-NEXT:    fadd %st(0), %st(1)
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    fmull {{\.LCPI.*}}
+; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:    fstpl mul1
 ; X86-NEXT:    retl
   %m = fmul double %x, 4.2
@@ -80,10 +80,14 @@
 define float @not_so_fast_recip_sqrt(float %x) {
 ; X64-LABEL: not_so_fast_recip_sqrt:
 ; X64:       # %bb.0:
-; X64-NEXT:    sqrtss %xmm0, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    divss %xmm1, %xmm0
-; X64-NEXT:    movss %xmm1, {{.*}}(%rip)
+; X64-NEXT:    rsqrtss %xmm0, %xmm1
+; X64-NEXT:    sqrtss %xmm0, %xmm2
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    addss {{.*}}(%rip), %xmm0
+; X64-NEXT:    mulss {{.*}}(%rip), %xmm1
+; X64-NEXT:    mulss %xmm1, %xmm0
+; X64-NEXT:    movss %xmm2, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: not_so_fast_recip_sqrt:
Index: test/CodeGen/X86/fmul-combines_fmf.ll
===================================================================
--- test/CodeGen/X86/fmul-combines_fmf.ll
+++ test/CodeGen/X86/fmul-combines_fmf.ll
@@ -0,0 +1,179 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: fmul2_f32:
+; CHECK: addss %xmm0, %xmm0
+define float @fmul2_f32(float %x) {
+  %y = fmul float %x, 2.0
+  ret float %y
+}
+
+; fmul 2.0, x -> fadd x, x for vectors.
+
+; CHECK-LABEL: fmul2_v4f32:
+; CHECK: addps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul2_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: constant_fold_fmul_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: ret
+define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul0_v4f32:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul_c2_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; We should be able to pre-multiply the two constant vectors.
+; CHECK: float 5
+; CHECK: float 12
+; CHECK: float 21
+; CHECK: float 32
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  ret <4 x float> %z
+}
+
+; Same as above, but reverse operands to make sure the non-canonical form is also handled.
+; CHECK: float 5
+; CHECK: float 12
+; CHECK: float 21
+; CHECK: float 32
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+  %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+  ret <4 x float> %z
+}
+
+; More than one use of a constant multiply should not inhibit the optimization.
+; Instead of a chain of 2 dependent mults, this test will have 2 independent mults.
+; CHECK: float 6
+; CHECK: float 14
+; CHECK: float 24
+; CHECK: float 36
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %a = fadd <4 x float> %y, %z
+  ret <4 x float> %a
+}
+
+; PR22698 - http://llvm.org/bugs/show_bug.cgi?id=22698
+; Make sure that we don't infinite loop swapping constants back and forth.
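+; (i.e. reassociating the constant operands must reach a fixed point rather
+; than ping-ponging between equivalent forms; each chain below should still
+; fold to a single constant multiply.)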
+
+define <4 x float> @PR22698_splats(<4 x float> %a) #0 {
+  %mul1 = fmul fast <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, <float 3.0, float 3.0, float 3.0, float 3.0>
+  %mul2 = fmul fast <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, %mul1
+  %mul3 = fmul fast <4 x float> %a, %mul2
+  ret <4 x float> %mul3
+
+; CHECK: float 24
+; CHECK: float 24
+; CHECK: float 24
+; CHECK: float 24
+; CHECK-LABEL: PR22698_splats:
+; CHECK: mulps
+; CHECK: ret
+}
+
+; Same as above, but verify that non-splat vectors are handled correctly too.
+define <4 x float> @PR22698_no_splats(<4 x float> %a) #0 {
+  %mul1 = fmul fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %mul2 = fmul fast <4 x float> <float 9.0, float 10.0, float 11.0, float 12.0>, %mul1
+  %mul3 = fmul fast <4 x float> %a, %mul2
+  ret <4 x float> %mul3
+
+; CHECK: float 45
+; CHECK: float 120
+; CHECK: float 231
+; CHECK: float 384
+; CHECK-LABEL: PR22698_no_splats:
+; CHECK: mulps
+; CHECK: ret
+}
+
+; CHECK-LABEL: fmul_c2_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c2_c4_f32(float %x) #0 {
+  %y = fmul float %x, 2.0
+  %z = fmul float %y, 4.0
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c3_c4_f32(float %x) #0 {
+  %y = fmul float %x, 3.0
+  %z = fmul float %y, 4.0
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_fneg_fneg_f32:
+; CHECK: mulss %xmm1, %xmm0
+; CHECK-NEXT: retq
+define float @fmul_fneg_fneg_f32(float %x, float %y) {
+  %x.neg = fsub float -0.0, %x
+  %y.neg = fsub float -0.0, %y
+  %mul = fmul float %x.neg, %y.neg
+  ret float %mul
+}
+; CHECK-LABEL: fmul_fneg_fneg_v4f32:
+; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) {
+  %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x
+  %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y
+  %mul = fmul <4 x float> %x.neg, %y.neg
+  ret <4 x float> %mul
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
Index: test/CodeGen/X86/fmul-zero_fmf.ll
===================================================================
--- test/CodeGen/X86/fmul-zero_fmf.ll
+++ test/CodeGen/X86/fmul-zero_fmf.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -mtriple=x86_64-- | grep mulps
+
+define void @test14(<4 x float>*) nounwind {
+  load <4 x float>, <4 x float>* %0, align 1
+  fmul <4 x float> %2, zeroinitializer
+  store <4 x float> %3, <4 x float>* %0, align 1
+  ret void
+}
Index: test/CodeGen/X86/fp-double-rounding_fmf.ll
===================================================================
--- test/CodeGen/X86/fp-double-rounding_fmf.ll
+++ test/CodeGen/X86/fp-double-rounding_fmf.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64--"
+
+; CHECK-LABEL: double_rounding:
+; SAFE: callq __trunctfdf2
+; SAFE-NEXT: cvtsd2ss %xmm0
+; UNSAFE: callq __trunctfsf2
+; UNSAFE-NOT: cvt
+define void @double_rounding(fp128* %x, float* %f) {
+entry:
+  %0 = load fp128, fp128* %x, align 16
+  %1 = fptrunc fp128 %0 to double
+  %2 = fptrunc double %1 to float
+  store float %2, float* %f, align 4
+  ret void
+}
+
+; CHECK-LABEL: double_rounding_precise_first:
+; CHECK: fstps (%
+; CHECK-NOT: fstpl
+define void @double_rounding_precise_first(float* %f) {
+entry:
+  ; Hack, to generate a precise FP_ROUND to double
+  %precise = call double asm sideeffect "fld %st(0)", "={st(0)}"()
+  %0 = fptrunc double %precise to float
+  store float %0, float* %f, align 4
+  ret void
+}
Index: test/CodeGen/X86/fp-fast_fmf.ll
===================================================================
--- test/CodeGen/X86/fp-fast_fmf.ll
+++ test/CodeGen/X86/fp-fast_fmf.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s
+
+define float @test1(float %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fadd fast float %a, %a
+  %r = fadd fast float %t1, %t1
+  ret float %r
+}
+
+define float @test2(float %a) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float 4.0, %a
+  %t2 = fadd fast float %a, %a
+  %r = fadd float %t1, %t2
+  ret float %r
+}
+
+define float @test3(float %a) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float %a, 4.0
+  %t2 = fadd fast float %a, %a
+  %r = fadd fast float %t1, %t2
+  ret float %r
+}
+
+define float @test4(float %a) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fadd fast float %a, %a
+  %t2 = fmul fast float 4.0, %a
+  %r = fadd fast float %t1, %t2
+  ret float %r
+}
+
+define float @test5(float %a) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fadd fast float %a, %a
+  %t2 = fmul fast float %a, 4.0
+  %r = fadd fast float %t1, %t2
+  ret float %r
+}
+
+define float @test6(float %a) {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float 2.0, %a
+  %t2 = fadd fast float %a, %a
+  %r = fsub fast float %t1, %t2
+  ret float %r
+}
+
+define float @test7(float %a) {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float %a, 2.0
+  %t2 = fadd fast float %a, %a
+  %r = fsub fast float %t1, %t2
+  ret float %r
+}
+
+define float @test8(float %a) {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float %a, 0.0
+  %t2 = fadd fast float %a, %t1
+  ret float %t2
+}
+
+define float @test9(float %a) {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %t1 = fmul fast float 0.0, %a
+  %t2 = fadd fast float %t1, %a
+  ret float %t2
+}
+
+define float @test10(float %a) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fsub fast float -0.0, %a
+  %t2 = fadd fast float %a, %t1
+  ret float %t2
+}
+
+define float @test11(float %a) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t1 = fsub fast float -0.0, %a
+  %t2 = fadd fast float %t1, %a
+  ret float %t2
+}
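+
+; Note on test10/test11: x + (-x) folds to +0.0 only under 'fast', since for
+; x = NaN or +/-infinity the original expression yields NaN, not zero.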