diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14153,28 +14153,42 @@ N1.getOperand(1), N0); } - // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) - // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E) + // fadd (fma A, B, (fmul C, D)), E --> fma C, D, (fma A, B, E) + // fadd E, (fma A, B, (fmul C, D)) --> fma C, D, (fma A, B, E) + // This also works with nested fma instructions: + // fadd (fma A, B, (fma (C, D, (fmul (E, F))))), G --> fma E, F, (fma C, D, + // fma (A, B, G)) fadd (G, (fma A, B, (fma (C, D, (fmul (E, F)))))) --> fma E, + // F, (fma C, D, fma (A, B, G)). // This requires reassociation because it changes the order of operations. - SDValue FMA, E; - if (CanReassociate && isFusedOp(N0) && - N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() && - N0.getOperand(2).hasOneUse()) { - FMA = N0; - E = N1; - } else if (CanReassociate && isFusedOp(N1) && - N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() && - N1.getOperand(2).hasOneUse()) { - FMA = N1; - E = N0; - } - if (FMA && E) { - SDValue A = FMA.getOperand(0); - SDValue B = FMA.getOperand(1); - SDValue C = FMA.getOperand(2).getOperand(0); - SDValue D = FMA.getOperand(2).getOperand(1); - SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E); - return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE); + // Moving the outermost FMA operands to the innermost FMA in the chain can + // help with eliminating a final copy to an output register. 
+ if (CanReassociate) { + SDValue FMA, E; + if (isFusedOp(N0) && N0.hasOneUse()) { + FMA = N0; + E = N1; + } else if (isFusedOp(N1) && N1.hasOneUse()) { + FMA = N1; + E = N0; + } + + SDValue TmpFMA = FMA; + while (E && TmpFMA && isFusedOp(TmpFMA)) { + SDValue FMul = TmpFMA->getOperand(2); + if (FMul.getOpcode() == ISD::FMUL && FMul.hasOneUse()) { + SDValue A = TmpFMA->getOperand(0); + SDValue B = TmpFMA->getOperand(1); + SDValue C = FMul.getOperand(0); + SDValue D = FMul.getOperand(1); + + SDValue NewFMA = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, FMA); + DAG.UpdateNodeOperands(TmpFMA.getNode(), A, B, E); + + return NewFMA; + } + + TmpFMA = TmpFMA->getOperand(2); + } } // Look through FP_EXTEND nodes to do more combining. diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -34,22 +34,22 @@ ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c ; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x4 +; GCN-NEXT: s_clause 0x3 ; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 ; GCN-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20 ; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 ; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 +; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp ; GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 ; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 -; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp ; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 -; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 ; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 +; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 ; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GCN-NEXT: v_mad_f32 v10, s2, v6, v2 ; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a +; GCN-NEXT: v_fma_f32 v7, v7, v6, v2 ; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 -; GCN-NEXT: v_mac_f32_e32 v10, v7, v6 +; GCN-NEXT: v_mac_f32_e32 v7, s2, v6 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v9, s10, v0 ; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14 @@ -58,19 +58,19 @@ ; GCN-NEXT: v_fmac_f32_e32 v9, v0, v6 ; GCN-NEXT: v_sub_f32_e32 v0, v1, v5 ; GCN-NEXT: v_mul_f32_e32 v1, v8, v6 -; GCN-NEXT: v_mul_f32_e32 v7, v6, v3 +; GCN-NEXT: v_mul_f32_e32 v8, v6, v3 ; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9 ; GCN-NEXT: v_fmac_f32_e32 v5, v0, v6 ; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6 +; GCN-NEXT: v_fmac_f32_e32 v8, v3, v6 ; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6 ; GCN-NEXT: v_mul_f32_e32 v0, v2, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, v4, v10 +; GCN-NEXT: v_add_f32_e32 v4, v4, v7 ; GCN-NEXT: v_mul_f32_e32 v3, v4, v6 ; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a ; GCN-NEXT: v_mul_f32_e32 v1, v3, v1 -; GCN-NEXT: v_mul_f32_e32 v2, v7, v4 +; GCN-NEXT: v_mul_f32_e32 v2, v8, v4 ; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0 ; GCN-NEXT: v_max_f32_e32 v0, 0, v1 ; GCN-NEXT: ; return to shader part epilog @@ -180,9 +180,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_fma_f32 v2, v2, v3, v4 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_fma_f32 v0, v0, v1, v4 +; GCN-NEXT: v_fmac_f32_e32 v0, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d @@ -196,10 +195,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5 -; GCN-NEXT: v_add_f32_e32 v0, v2, v6 +; GCN-NEXT: v_fma_f32 v0, v0, v1, v6 +; GCN-NEXT: v_fmac_f32_e32 v0, v4, v5 +; GCN-NEXT: v_mac_f32_e32 v0, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll 
b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -24,8 +24,8 @@ ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4 ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: s_endpgm @@ -44,8 +44,8 @@ ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) -; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; GCN-FASTFMA-NEXT: v_fma_f32 v0, v3, v4, v0 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: s_endpgm