diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5567,8 +5567,12 @@
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
   const TargetOptions &Options = DAG.getTarget().Options;
-  if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND &&
-                           isFPExtFree(VT, Op.getOperand(0).getValueType())))
+  if (!Op.hasOneUse() &&
+      !(Op.getOpcode() == ISD::FP_EXTEND &&
+        isFPExtFree(VT, Op.getOperand(0).getValueType())) &&
+      !(Op.getOpcode() == ISD::ConstantFP &&
+        !getNegatedExpression(Op, DAG, LegalOperations, ForCodeSize)
+             .hasOneUse()))
     return NegatibleCost::Expensive;
 
   // Don't recurse exponentially.
@@ -5775,14 +5779,7 @@
                                         ForCodeSize, Depth + 1);
     NegatibleCost V1 = getNegatibleCost(Op.getOperand(1), DAG, LegalOperations,
                                         ForCodeSize, Depth + 1);
-    // TODO: This is a hack. It is possible that costs have changed between now
-    //       and the initial calls to getNegatibleCost(). That is because we
-    //       are rewriting the expression, and that may change the number of
-    //       uses (and therefore the cost) of values. If the negation costs are
-    //       equal, only negate this value if it is a constant. Otherwise, try
-    //       operand 1. A better fix would eliminate uses as a cost factor or
-    //       track the change in uses as we rewrite the expression.
-    if (V0 > V1 || (V0 == V1 && isa<ConstantFPSDNode>(Op.getOperand(0)))) {
+    if (V0 > V1) {
       // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
       SDValue Neg0 = getNegatedExpression(
           Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -275,9 +275,9 @@
 
 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
 
 ; GCN-DENORM-DAG: v_rcp_f32_e32
 ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -299,7 +299,7 @@
 ; GCN-DENORM-DAG: v_div_fmas_f32
 ; GCN-DENORM-DAG: v_div_fmas_f32
 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
 
 ; GCN-FLUSH-DAG: v_rcp_f32_e32
 ; GCN-FLUSH-DAG: v_rcp_f32_e64
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -137,3 +137,77 @@
   %add = fsub double %mul, %a
   ret double %add
 }
+
+define float @fma_combine_no_ice() {
+; CHECK-FAST-LABEL: fma_combine_no_ice:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; CHECK-FAST-NEXT: addis 4, 2, .LCPI4_1@toc@ha
+; CHECK-FAST-NEXT: addis 5, 2, .LCPI4_4@toc@ha
+; CHECK-FAST-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; CHECK-FAST-NEXT: lfsx 0, 0, 3
+; CHECK-FAST-NEXT: lfsx 1, 0, 3
+; CHECK-FAST-NEXT: addis 3, 2, .LCPI4_2@toc@ha
+; CHECK-FAST-NEXT: lfs 2, .LCPI4_1@toc@l(4)
+; CHECK-FAST-NEXT: addis 4, 2, .LCPI4_3@toc@ha
+; CHECK-FAST-NEXT: lfs 3, .LCPI4_2@toc@l(3)
+; CHECK-FAST-NEXT: lfs 4, .LCPI4_3@toc@l(4)
+; CHECK-FAST-NEXT: xsmaddasp 2, 0, 1
+; CHECK-FAST-NEXT: lfs 1, .LCPI4_4@toc@l(5)
+; CHECK-FAST-NEXT: xsmaddasp 4, 0, 3
+; CHECK-FAST-NEXT: xsmaddasp 1, 0, 2
+; CHECK-FAST-NEXT: xsmaddasp 1, 4, 0
+; CHECK-FAST-NEXT: blr
+;
+; CHECK-FAST-NOVSX-LABEL: fma_combine_no_ice:
+; CHECK-FAST-NOVSX: # %bb.0:
+; CHECK-FAST-NOVSX-NEXT: lfs 0, 0(3)
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; CHECK-FAST-NOVSX-NEXT: addis 4, 2, .LCPI4_1@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 1, .LCPI4_0@toc@l(3)
+; CHECK-FAST-NOVSX-NEXT: lfs 2, .LCPI4_1@toc@l(4)
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI4_2@toc@ha
+; CHECK-FAST-NOVSX-NEXT: addis 4, 2, .LCPI4_4@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 4, .LCPI4_4@toc@l(4)
+; CHECK-FAST-NOVSX-NEXT: fmadds 1, 0, 2, 1
+; CHECK-FAST-NOVSX-NEXT: lfs 2, .LCPI4_2@toc@l(3)
+; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI4_3@toc@ha
+; CHECK-FAST-NOVSX-NEXT: lfs 3, .LCPI4_3@toc@l(3)
+; CHECK-FAST-NOVSX-NEXT: fmadds 2, 0, 3, 2
+; CHECK-FAST-NOVSX-NEXT: fmadds 1, 0, 1, 4
+; CHECK-FAST-NOVSX-NEXT: fmadds 1, 2, 0, 1
+; CHECK-FAST-NOVSX-NEXT: blr
+;
+; CHECK-LABEL: fma_combine_no_ice:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; CHECK-NEXT: addis 4, 2, .LCPI4_1@toc@ha
+; CHECK-NEXT: addis 5, 2, .LCPI4_4@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; CHECK-NEXT: lfsx 0, 0, 3
+; CHECK-NEXT: lfsx 1, 0, 3
+; CHECK-NEXT: addis 3, 2, .LCPI4_2@toc@ha
+; CHECK-NEXT: lfs 2, .LCPI4_1@toc@l(4)
+; CHECK-NEXT: addis 4, 2, .LCPI4_3@toc@ha
+; CHECK-NEXT: lfs 3, .LCPI4_2@toc@l(3)
+; CHECK-NEXT: lfs 4, .LCPI4_3@toc@l(4)
+; CHECK-NEXT: xsmaddasp 2, 0, 1
+; CHECK-NEXT: lfs 1, .LCPI4_4@toc@l(5)
+; CHECK-NEXT: xsmaddasp 4, 0, 3
+; CHECK-NEXT: xsmaddasp 1, 0, 2
+; CHECK-NEXT: xsmaddasp 1, 4, 0
+; CHECK-NEXT: blr
+  %tmp = load float, float* undef, align 4
+  %tmp2 = load float, float* undef, align 4
+  %tmp3 = fmul fast float %tmp, 0x3FE372D780000000
+  %tmp4 = fadd fast float %tmp3, 1.000000e+00
+  %tmp5 = fmul fast float %tmp2, %tmp4
+  %tmp6 = load float, float* undef, align 4
+  %tmp7 = load float, float* undef, align 4
+  %tmp8 = fmul fast float %tmp7, 0x3FE372D780000000
+  %tmp9 = fsub fast float -1.000000e+00, %tmp8
+  %tmp10 = fmul fast float %tmp9, %tmp6
+  %tmp11 = fadd fast float %tmp5, 5.000000e-01
+  %tmp12 = fadd fast float %tmp11, %tmp10
+  ret float %tmp12
+}
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -440,36 +440,34 @@
 define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: test16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: movaps %xmm1, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSE-NEXT: subss %xmm5, %xmm4
-; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: addss %xmm3, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: addss %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE-NEXT: subss %xmm4, %xmm3
+; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: addss {{.*}}(%rip), %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE-NEXT: movaps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4
-; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss {{.*}}(%rip), %xmm4, %xmm4
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0