diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14153,28 +14153,62 @@ N1.getOperand(1), N0); } - // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) - // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E) + // fadd (fma A, B, (fmul C, D)), E --> fma C, D, (fma A, B, E) + // fadd E, (fma A, B, (fmul C, D)) --> fma C, D, (fma A, B, E) + // This also works with nested fma instructions: + // fadd (fma A, B, (fma C, D, (fmul E, F))), G --> + // fma E, F, (fma C, D, (fma A, B, G)) + // fadd G, (fma A, B, (fma C, D, (fmul E, F))) --> + // fma E, F, (fma C, D, (fma A, B, G)). // This requires reassociation because it changes the order of operations. - SDValue FMA, E; - if (CanReassociate && isFusedOp(N0) && - N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() && - N0.getOperand(2).hasOneUse()) { - FMA = N0; - E = N1; - } else if (CanReassociate && isFusedOp(N1) && - N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() && - N1.getOperand(2).hasOneUse()) { - FMA = N1; - E = N0; - } - if (FMA && E) { - SDValue A = FMA.getOperand(0); - SDValue B = FMA.getOperand(1); - SDValue C = FMA.getOperand(2).getOperand(0); - SDValue D = FMA.getOperand(2).getOperand(1); - SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E); - return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE); + + // Moving the outermost FMA operands to the innermost FMA in the chain can + // help with eliminating a final copy to an output register. For instance, + // look at the DAG transformation + // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E). + // + // If a function wants to return the result of the outermost FMA, a final + // COPY to the first available register will be inserted, which is then + // returned. 
+ // In this case, this is the virtual register being assigned to A (%0). So, + // this register is in use by the RETURN. During RA, a recolorization attempt + // takes place. As A (%0) and B (%1) are in use by the outermost FMA, the + // output register (%6) for the FMA can only be assigned to the first unused + // register, which is the one for C (%2). During virtual register rewriting, + // it will attempt to eliminate identity copies. However, as the register for + // A (%0) is in use by the RETURN, it cannot eliminate the COPY. This means + // this will result in a superfluous move instruction. By swapping the + // operands of the FMA instructions, the output register is freed up (because + // %6 can be assigned to %0 as the outermost FMA uses only %2 and %3), + // essentially letting the virtual register rewriter eliminate the final copy. + + if (CanReassociate) { + SDValue FMA, E; + if (isFusedOp(N0) && N0.hasOneUse()) { + FMA = N0; + E = N1; + } else if (isFusedOp(N1) && N1.hasOneUse()) { + FMA = N1; + E = N0; + } + + SDValue TmpFMA = FMA; + while (E && TmpFMA && isFusedOp(TmpFMA)) { + SDValue FMul = TmpFMA->getOperand(2); + if (FMul.getOpcode() == ISD::FMUL && FMul.hasOneUse()) { + SDValue A = TmpFMA->getOperand(0); + SDValue B = TmpFMA->getOperand(1); + SDValue C = FMul.getOperand(0); + SDValue D = FMul.getOperand(1); + + SDValue NewFMA = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, FMA); + DAG.UpdateNodeOperands(TmpFMA.getNode(), A, B, E); + + return NewFMA; + } + + TmpFMA = TmpFMA->getOperand(2); + } } // Look through FP_EXTEND nodes to do more combining. 
diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll --- a/llvm/test/CodeGen/AArch64/fadd-combines.ll +++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll @@ -197,8 +197,8 @@ define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, double %n1) nounwind { ; CHECK-LABEL: fadd_fma_fmul_1: ; CHECK: // %bb.0: -; CHECK-NEXT: fmadd d2, d2, d3, d4 -; CHECK-NEXT: fmadd d0, d0, d1, d2 +; CHECK-NEXT: fmadd d0, d0, d1, d4 +; CHECK-NEXT: fmadd d0, d2, d3, d0 ; CHECK-NEXT: ret %m1 = fmul fast double %a, %b %m2 = fmul fast double %c, %d @@ -214,8 +214,8 @@ define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n0) nounwind { ; CHECK-LABEL: fadd_fma_fmul_fmf: ; CHECK: // %bb.0: -; CHECK-NEXT: fmadd s2, s2, s3, s4 -; CHECK-NEXT: fmadd s0, s0, s1, s2 +; CHECK-NEXT: fmadd s0, s0, s1, s4 +; CHECK-NEXT: fmadd s0, s2, s3, s0 ; CHECK-NEXT: ret %m1 = fmul contract float %a, %b %m2 = fmul contract float %c, %d @@ -247,8 +247,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d -; CHECK-NEXT: fmla v2.2d, v7.2d, v6.2d ; CHECK-NEXT: fmla v2.2d, v5.2d, v4.2d +; CHECK-NEXT: fmla v2.2d, v7.2d, v6.2d ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %m1 = fmul fast <2 x double> %x1, %x2 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -34,22 +34,22 @@ ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c ; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x4 +; GCN-NEXT: s_clause 0x3 ; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 ; GCN-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20 ; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 ; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 +; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp ; 
GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 ; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 -; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp ; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 -; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 ; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 +; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 ; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GCN-NEXT: v_mad_f32 v10, s2, v6, v2 ; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a +; GCN-NEXT: v_fma_f32 v7, v7, v6, v2 ; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 -; GCN-NEXT: v_mac_f32_e32 v10, v7, v6 +; GCN-NEXT: v_mac_f32_e32 v7, s2, v6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v9, s10, v0 ; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14 @@ -58,19 +58,19 @@ ; GCN-NEXT: v_fmac_f32_e32 v9, v0, v6 ; GCN-NEXT: v_sub_f32_e32 v0, v1, v5 ; GCN-NEXT: v_mul_f32_e32 v1, v8, v6 -; GCN-NEXT: v_mul_f32_e32 v7, v6, v3 +; GCN-NEXT: v_mul_f32_e32 v8, v6, v3 ; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9 ; GCN-NEXT: v_fmac_f32_e32 v5, v0, v6 ; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6 +; GCN-NEXT: v_fmac_f32_e32 v8, v3, v6 ; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6 ; GCN-NEXT: v_mul_f32_e32 v0, v2, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, v4, v10 +; GCN-NEXT: v_add_f32_e32 v4, v4, v7 ; GCN-NEXT: v_mul_f32_e32 v3, v4, v6 ; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a ; GCN-NEXT: v_mul_f32_e32 v1, v3, v1 -; GCN-NEXT: v_mul_f32_e32 v2, v7, v4 +; GCN-NEXT: v_mul_f32_e32 v2, v8, v4 ; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0 ; GCN-NEXT: v_max_f32_e32 v0, 0, v1 ; GCN-NEXT: ; return to shader part epilog @@ -180,9 +180,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_fma_f32 v2, v2, v3, v4 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_fma_f32 v0, v0, v1, v4 +; GCN-NEXT: v_fmac_f32_e32 v0, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d @@ -196,10 +195,9 @@ ; GCN: 
; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5 -; GCN-NEXT: v_add_f32_e32 v0, v2, v6 +; GCN-NEXT: v_fma_f32 v0, v0, v1, v6 +; GCN-NEXT: v_fmac_f32_e32 v0, v4, v5 +; GCN-NEXT: v_mac_f32_e32 v0, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -24,8 +24,8 @@ ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4 ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GCN-FLUSH-NEXT: s_endpgm @@ -44,8 +44,8 @@ ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) -; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; GCN-FASTFMA-NEXT: v_fma_f32 v0, v3, v4, v0 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) ; GCN-FASTFMA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll --- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll +++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll @@ -457,15 +457,14 @@ define double @test_reassoc_FMADD_ASSOC1(double %A, double %B, double %C, ; CHECK-LABEL: test_reassoc_FMADD_ASSOC1: ; CHECK: # %bb.0: -; CHECK-NEXT: fmadd 0, 3, 4, 5 -; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 ; CHECK-NEXT: blr ; ; 
CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC1: ; CHECK-VSX: # %bb.0: -; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 -; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 -; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 ; CHECK-VSX-NEXT: blr ; ; CHECK-SPE-LABEL: test_reassoc_FMADD_ASSOC1: @@ -494,15 +493,14 @@ define double @test_reassoc_FMADD_ASSOC2(double %A, double %B, double %C, ; CHECK-LABEL: test_reassoc_FMADD_ASSOC2: ; CHECK: # %bb.0: -; CHECK-NEXT: fmadd 0, 3, 4, 5 -; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 ; CHECK-NEXT: blr ; ; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC2: ; CHECK-VSX: # %bb.0: -; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 -; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 -; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 ; CHECK-VSX-NEXT: blr ; ; CHECK-SPE-LABEL: test_reassoc_FMADD_ASSOC2: diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-PWR ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 < %s | FileCheck %s -check-prefix=FIXPOINT target datalayout = "E-m:e-i64:64-n32:64" @@ -9,10 +10,17 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { ; CHECK-LABEL: reassociate_adds1: ; CHECK: # %bb.0: -; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: fadds 0, 1, 2 +; CHECK-NEXT: fadds 1, 3, 4 +; CHECK-NEXT: fadds 1, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds1: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsaddsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 1, 3, 4 
+; FIXPOINT-NEXT: xsaddsp 1, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %t0, %x2 @@ -23,10 +31,17 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { ; CHECK-LABEL: reassociate_adds2: ; CHECK: # %bb.0: -; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: fadds 0, 1, 2 +; CHECK-NEXT: fadds 1, 3, 4 +; CHECK-NEXT: fadds 1, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds2: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsaddsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 1, 3, 4 +; FIXPOINT-NEXT: xsaddsp 1, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -37,10 +52,17 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { ; CHECK-LABEL: reassociate_adds3: ; CHECK: # %bb.0: -; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: fadds 0, 1, 2 +; CHECK-NEXT: fadds 1, 3, 4 +; CHECK-NEXT: fadds 1, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds3: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsaddsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 1, 3, 4 +; FIXPOINT-NEXT: xsaddsp 1, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %t0, %x2 @@ -51,10 +73,17 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { ; CHECK-LABEL: reassociate_adds4: ; CHECK: # %bb.0: -; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: fadds 0, 1, 2 +; CHECK-NEXT: fadds 1, 3, 4 +; CHECK-NEXT: fadds 1, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds4: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsaddsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 1, 3, 4 +; FIXPOINT-NEXT: xsaddsp 1, 0, 1 +; 
FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -68,14 +97,25 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) { ; CHECK-LABEL: reassociate_adds5: ; CHECK: # %bb.0: -; CHECK-DAG: fadds [[REG12:[0-9]+]], 5, 6 -; CHECK-DAG: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK-DAG: fadds [[REG11:[0-9]+]], 3, 4 -; CHECK-DAG: fadds [[REG13:[0-9]+]], [[REG12]], 7 -; CHECK-DAG: fadds [[REG1:[0-9]+]], [[REG0]], [[REG11]] -; CHECK-DAG: fadds [[REG2:[0-9]+]], [[REG1]], [[REG13]] -; CHECK: fadds 1, [[REG2]], 8 +; CHECK-NEXT: fadds 0, 1, 2 +; CHECK-NEXT: fadds 1, 3, 4 +; CHECK-NEXT: fadds 2, 5, 6 +; CHECK-NEXT: fadds 0, 0, 1 +; CHECK-NEXT: fadds 1, 2, 7 +; CHECK-NEXT: fadds 0, 0, 1 +; CHECK-NEXT: fadds 1, 0, 8 ; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds5: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsaddsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 1, 3, 4 +; FIXPOINT-NEXT: xsaddsp 0, 0, 1 +; FIXPOINT-NEXT: xsaddsp 1, 5, 6 +; FIXPOINT-NEXT: xsaddsp 1, 1, 7 +; FIXPOINT-NEXT: xsaddsp 0, 0, 1 +; FIXPOINT-NEXT: xsaddsp 1, 0, 8 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %t0, %x2 @@ -92,10 +132,17 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds1: ; CHECK: # %bb.0: -; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: xvaddsp 0, 34, 35 +; CHECK-NEXT: xvaddsp 1, 36, 37 +; CHECK-NEXT: xvaddsp 34, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: vector_reassociate_adds1: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xvaddsp 0, 34, 35 +; FIXPOINT-NEXT: xvaddsp 1, 36, 37 +; FIXPOINT-NEXT: xvaddsp 34, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fadd reassoc nsz <4 x float> %t0, %x2 @@ -106,10 
+153,17 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds2: ; CHECK: # %bb.0: -; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: xvaddsp 0, 34, 35 +; CHECK-NEXT: xvaddsp 1, 36, 37 +; CHECK-NEXT: xvaddsp 34, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: vector_reassociate_adds2: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xvaddsp 0, 34, 35 +; FIXPOINT-NEXT: xvaddsp 1, 36, 37 +; FIXPOINT-NEXT: xvaddsp 34, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fadd reassoc nsz <4 x float> %x2, %t0 @@ -120,10 +174,17 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: -; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: xvaddsp 0, 34, 35 +; CHECK-NEXT: xvaddsp 1, 36, 37 +; CHECK-NEXT: xvaddsp 34, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: vector_reassociate_adds3: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xvaddsp 0, 34, 35 +; FIXPOINT-NEXT: xvaddsp 1, 36, 37 +; FIXPOINT-NEXT: xvaddsp 34, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fadd reassoc nsz <4 x float> %t0, %x2 @@ -134,10 +195,17 @@ define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: -; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: xvaddsp 0, 34, 35 +; CHECK-NEXT: xvaddsp 1, 36, 37 +; CHECK-NEXT: xvaddsp 34, 0, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: 
vector_reassociate_adds4: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xvaddsp 0, 34, 35 +; FIXPOINT-NEXT: xvaddsp 1, 36, 37 +; FIXPOINT-NEXT: xvaddsp 34, 0, 1 +; FIXPOINT-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fadd reassoc nsz <4 x float> %x2, %t0 @@ -146,6 +214,19 @@ } define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds6: +; CHECK: # %bb.0: +; CHECK-NEXT: fdivs 0, 1, 2 +; CHECK-NEXT: fadds 0, 3, 0 +; CHECK-NEXT: fadds 1, 4, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds6: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsdivsp 0, 1, 2 +; FIXPOINT-NEXT: xsaddsp 0, 3, 0 +; FIXPOINT-NEXT: xsaddsp 1, 4, 0 +; FIXPOINT-NEXT: blr %t0 = fdiv float %x0, %x1 %t1 = fadd float %x2, %t0 %t2 = fadd float %x3, %t1 @@ -153,6 +234,19 @@ } define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_muls1: +; CHECK: # %bb.0: +; CHECK-NEXT: fdivs 0, 1, 2 +; CHECK-NEXT: fmuls 0, 3, 0 +; CHECK-NEXT: fmuls 1, 4, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_muls1: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsdivsp 0, 1, 2 +; FIXPOINT-NEXT: xsmulsp 0, 3, 0 +; FIXPOINT-NEXT: xsmulsp 1, 4, 0 +; FIXPOINT-NEXT: blr %t0 = fdiv float %x0, %x1 %t1 = fmul float %x2, %t0 %t2 = fmul float %x3, %t1 @@ -160,6 +254,19 @@ } define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) { +; CHECK-LABEL: reassociate_adds_double: +; CHECK: # %bb.0: +; CHECK-NEXT: xsdivdp 0, 1, 2 +; CHECK-NEXT: xsadddp 0, 3, 0 +; CHECK-NEXT: xsadddp 1, 4, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_adds_double: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsdivdp 0, 1, 2 +; FIXPOINT-NEXT: xsadddp 0, 3, 0 +; FIXPOINT-NEXT: xsadddp 1, 4, 0 +; FIXPOINT-NEXT: blr %t0 = fdiv double %x0, %x1 %t1 = fadd double %x2, %t0 %t2 = fadd double %x3, %t1 @@ -167,6 +274,19 @@ } define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) { +; 
CHECK-LABEL: reassociate_muls_double: +; CHECK: # %bb.0: +; CHECK-NEXT: xsdivdp 0, 1, 2 +; CHECK-NEXT: xsmuldp 0, 3, 0 +; CHECK-NEXT: xsmuldp 1, 4, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_muls_double: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsdivdp 0, 1, 2 +; FIXPOINT-NEXT: xsmuldp 0, 3, 0 +; FIXPOINT-NEXT: xsmuldp 1, 4, 0 +; FIXPOINT-NEXT: blr %t0 = fdiv double %x0, %x1 %t1 = fmul double %x2, %t0 %t2 = fmul double %x3, %t1 @@ -174,12 +294,19 @@ } define i32 @reassociate_mullw(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { +; CHECK-LABEL: reassociate_mullw: +; CHECK: # %bb.0: +; CHECK-NEXT: mullw 3, 3, 4 +; CHECK-NEXT: mullw 4, 5, 6 +; CHECK-NEXT: mullw 3, 3, 4 +; CHECK-NEXT: blr +; ; FIXPOINT-LABEL: reassociate_mullw: ; FIXPOINT: # %bb.0: -; FIXPOINT: mullw [[REG0:[0-9]+]], 3, 4 -; FIXPOINT: mullw [[REG1:[0-9]+]], 5, 6 -; FIXPOINT: mullw 3, [[REG0]], [[REG1]] -; FIXPOINT-NEXT: blr +; FIXPOINT-NEXT: mullw 3, 3, 4 +; FIXPOINT-NEXT: mullw 4, 5, 6 +; FIXPOINT-NEXT: mullw 3, 3, 4 +; FIXPOINT-NEXT: blr %t0 = mul i32 %x0, %x1 %t1 = mul i32 %t0, %x2 @@ -188,12 +315,19 @@ } define i64 @reassociate_mulld(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { +; CHECK-LABEL: reassociate_mulld: +; CHECK: # %bb.0: +; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: mulld 4, 5, 6 +; CHECK-NEXT: mulld 3, 3, 4 +; CHECK-NEXT: blr +; ; FIXPOINT-LABEL: reassociate_mulld: ; FIXPOINT: # %bb.0: -; FIXPOINT: mulld [[REG0:[0-9]+]], 3, 4 -; FIXPOINT: mulld [[REG1:[0-9]+]], 5, 6 -; FIXPOINT: mulld 3, [[REG0]], [[REG1]] -; FIXPOINT-NEXT: blr +; FIXPOINT-NEXT: mulld 3, 3, 4 +; FIXPOINT-NEXT: mulld 4, 5, 6 +; FIXPOINT-NEXT: mulld 3, 3, 4 +; FIXPOINT-NEXT: blr %t0 = mul i64 %x0, %x1 %t1 = mul i64 %t0, %x2 @@ -204,10 +338,17 @@ define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: -; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 -; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 -; CHECK-PWR: xsadddp 1, 2, 1 -; CHECK-NEXT: blr +; 
CHECK-NEXT: xsmaddadp 1, 6, 5 +; CHECK-NEXT: xsmaddadp 2, 4, 3 +; CHECK-NEXT: xsadddp 1, 2, 1 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_mamaa_double: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsmaddadp 1, 6, 5 +; FIXPOINT-NEXT: xsmaddadp 2, 4, 3 +; FIXPOINT-NEXT: xsadddp 1, 2, 1 +; FIXPOINT-NEXT: blr %7 = fmul contract reassoc nsz double %3, %2 %8 = fmul contract reassoc nsz double %5, %4 %9 = fadd contract reassoc nsz double %1, %0 @@ -219,10 +360,17 @@ define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, float %4, float %5) { ; CHECK-LABEL: reassociate_mamaa_float: ; CHECK: # %bb.0: -; CHECK-DAG: fmadds [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-DAG: fmadds [[REG1:[0-9]+]], 6, 5, 1 -; CHECK: fadds 1, [[REG0]], [[REG1]] -; CHECK-NEXT: blr +; CHECK-NEXT: fmadds 0, 6, 5, 1 +; CHECK-NEXT: fmadds 1, 4, 3, 2 +; CHECK-NEXT: fadds 1, 1, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_mamaa_float: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsmaddasp 1, 6, 5 +; FIXPOINT-NEXT: xsmaddasp 2, 4, 3 +; FIXPOINT-NEXT: xsaddsp 1, 2, 1 +; FIXPOINT-NEXT: blr %7 = fmul contract reassoc nsz float %3, %2 %8 = fmul contract reassoc nsz float %5, %4 %9 = fadd contract reassoc nsz float %1, %0 @@ -234,10 +382,17 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: -; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 -; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 -; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] -; CHECK-NEXT: blr +; CHECK-NEXT: xvmaddasp 34, 39, 38 +; CHECK-NEXT: xvmaddasp 35, 37, 36 +; CHECK-NEXT: xvaddsp 34, 35, 34 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_mamaa_vec: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xvmaddasp 34, 39, 38 +; FIXPOINT-NEXT: xvmaddasp 35, 37, 36 +; FIXPOINT-NEXT: xvaddsp 34, 35, 34 +; FIXPOINT-NEXT: blr %7 = fmul contract reassoc nsz <4 x float> %3, %2 %8 = fmul 
contract reassoc nsz <4 x float> %5, %4 %9 = fadd contract reassoc nsz <4 x float> %1, %0 @@ -249,12 +404,21 @@ define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: -; CHECK-PWR: xsmaddadp 7, 2, 1 -; CHECK-PWR-DAG: xsmuldp [[REG0:[0-9]+]], 4, 3 -; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 -; CHECK-PWR-DAG: xsmaddadp [[REG0]], 9, 8 -; CHECK-PWR: xsadddp 1, 7, [[REG0]] -; CHECK-NEXT: blr +; CHECK-NEXT: xsmaddadp 7, 4, 3 +; CHECK-NEXT: xsmuldp 0, 2, 1 +; CHECK-NEXT: xsmaddadp 7, 6, 5 +; CHECK-NEXT: xsmaddadp 0, 9, 8 +; CHECK-NEXT: xsadddp 1, 7, 0 +; CHECK-NEXT: blr +; +; FIXPOINT-LABEL: reassociate_mamama_double: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsmaddadp 7, 4, 3 +; FIXPOINT-NEXT: xsmuldp 0, 2, 1 +; FIXPOINT-NEXT: xsmaddadp 7, 6, 5 +; FIXPOINT-NEXT: xsmaddadp 0, 9, 8 +; FIXPOINT-NEXT: xsadddp 1, 7, 0 +; FIXPOINT-NEXT: blr %10 = fmul contract reassoc nsz double %1, %0 %11 = fmul contract reassoc nsz double %3, %2 %12 = fmul contract reassoc nsz double %5, %4 @@ -267,21 +431,40 @@ } define dso_local float @reassociate_mamama_8(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, - float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16) { ; CHECK-LABEL: reassociate_mamama_8: ; CHECK: # %bb.0: -; CHECK-DAG: fmadds [[REG0:[0-9]+]], 3, 2, 1 -; CHECK-DAG: fmuls [[REG1:[0-9]+]], 5, 4 -; CHECK-DAG: fmadds [[REG2:[0-9]+]], 7, 6, [[REG0]] -; CHECK-DAG: fmadds [[REG3:[0-9]+]], 9, 8, [[REG1]] -; -; CHECK-DAG: fmadds [[REG4:[0-9]+]], 13, 12, [[REG3]] -; CHECK-DAG: fmadds [[REG5:[0-9]+]], 11, 10, [[REG2]] +; CHECK-NEXT: fmadds 0, 3, 2, 1 +; CHECK-NEXT: fmuls 1, 5, 4 +; CHECK-NEXT: lfs 2, 172(1) +; CHECK-NEXT: lfs 3, 180(1) +; CHECK-NEXT: lfs 4, 156(1) +; CHECK-NEXT: lfs 5, 164(1) +; CHECK-NEXT: fmadds 0, 7, 6, 0 +; CHECK-NEXT: fmadds 1, 9, 8, 1 +; CHECK-NEXT: fmadds 1, 
13, 12, 1 +; CHECK-NEXT: fmadds 0, 11, 10, 0 +; CHECK-NEXT: fmadds 1, 3, 2, 1 +; CHECK-NEXT: fmadds 0, 5, 4, 0 +; CHECK-NEXT: fadds 1, 0, 1 +; CHECK-NEXT: blr ; -; CHECK-DAG: fmadds [[REG6:[0-9]+]], 3, 2, [[REG4]] -; CHECK-DAG: fmadds [[REG7:[0-9]+]], 5, 4, [[REG5]] -; CHECK: fadds 1, [[REG7]], [[REG6]] -; CHECK-NEXT: blr +; FIXPOINT-LABEL: reassociate_mamama_8: +; FIXPOINT: # %bb.0: +; FIXPOINT-NEXT: xsmaddasp 1, 3, 2 +; FIXPOINT-NEXT: xsmulsp 2, 5, 4 +; FIXPOINT-NEXT: lxssp 2, 180(1) +; FIXPOINT-NEXT: lxssp 3, 156(1) +; FIXPOINT-NEXT: lxssp 4, 164(1) +; FIXPOINT-NEXT: xsmaddasp 1, 7, 6 +; FIXPOINT-NEXT: xsmaddasp 2, 9, 8 +; FIXPOINT-NEXT: lfs 0, 172(1) +; FIXPOINT-NEXT: xsmaddasp 2, 13, 12 +; FIXPOINT-NEXT: xsmaddasp 1, 11, 10 +; FIXPOINT-NEXT: xsmaddasp 2, 34, 0 +; FIXPOINT-NEXT: xsmaddasp 1, 36, 35 +; FIXPOINT-NEXT: xsaddsp 1, 1, 2 +; FIXPOINT-NEXT: blr + float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16) { %18 = fmul contract reassoc nsz float %2, %1 %19 = fadd contract reassoc nsz float %18, %0 %20 = fmul contract reassoc nsz float %4, %3 @@ -301,3 +484,5 @@ ret float %33 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-PWR: {{.*}} diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1800,20 +1800,20 @@ define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, double %n1) nounwind { ; FMA-LABEL: fadd_fma_fmul_1: ; FMA: # %bb.0: -; FMA-NEXT: vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 -; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm4 +; FMA-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm3 * xmm2) + xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: fadd_fma_fmul_1: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmaddsd {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4 -; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 +; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm4 +; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm2 * xmm3) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 -; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm4 +; AVX512-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm3 * xmm2) + xmm0 ; AVX512-NEXT: retq %m1 = fmul fast double %a, %b %m2 = fmul fast double %c, %d @@ -1829,20 +1829,20 @@ define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n0) nounwind { ; FMA-LABEL: fadd_fma_fmul_fmf: ; FMA: # %bb.0: -; FMA-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 -; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm4 +; FMA-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm3 * xmm2) + xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: fadd_fma_fmul_fmf: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmaddss {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4 -; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 +; FMA4-NEXT: vfmaddss 
{{.*#+}} xmm0 = (xmm0 * xmm1) + xmm4 +; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm2 * xmm3) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_fmf: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4 -; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm4 +; AVX512-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm3 * xmm2) + xmm0 ; AVX512-NEXT: retq %m1 = fmul float %a, %b %m2 = fmul float %c, %d @@ -1888,8 +1888,8 @@ ; FMA: # %bb.0: ; FMA-NEXT: vmulpd %xmm3, %xmm2, %xmm2 ; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 ; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 ; FMA-NEXT: vmovapd %xmm2, %xmm0 ; FMA-NEXT: retq ; @@ -1897,16 +1897,16 @@ ; FMA4: # %bb.0: ; FMA4-NEXT: vmulpd %xmm3, %xmm2, %xmm2 ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0 ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm4 * xmm5) + xmm0 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_3: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulpd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 ; AVX512-NEXT: vmovapd %xmm2, %xmm0 ; AVX512-NEXT: retq %m1 = fmul fast <2 x double> %x1, %x2