Index: ../lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- ../lib/Target/X86/X86ISelLowering.cpp
+++ ../lib/Target/X86/X86ISelLowering.cpp
@@ -14480,7 +14480,8 @@
 
 /// The only differences between FABS and FNEG are the mask and the logic op.
 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
-static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG,
+                               bool ForceLegalization = false) {
   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
          "Wrong opcode for lowering FABS or FNEG.");
 
@@ -14496,6 +14497,10 @@
 
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  if (!IsFABS && !ForceLegalization && VT.isVector())
+    // Try to combine FNEG with other nodes
+    return Op;
+
   bool IsF128 = (VT == MVT::f128);
 
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
@@ -30069,14 +30074,16 @@
 
 /// Do target-specific dag combines on floating point negations.
 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   EVT SVT = VT.getScalarType();
   SDValue Arg = N->getOperand(0);
   SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+  if (!TLI.isTypeLegal(VT))
     return SDValue();
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
@@ -30105,6 +30112,32 @@
     case X86ISD::FNMSUB:
       return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
                          Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMADD_RND:
+      return DAG.getNode(X86ISD::FNMSUB_RND, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
+    case X86ISD::FMSUB_RND:
+      return DAG.getNode(X86ISD::FNMADD_RND, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
+    case X86ISD::FNMADD_RND:
+      return DAG.getNode(X86ISD::FMSUB_RND, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
+    case X86ISD::FNMSUB_RND:
+      return DAG.getNode(X86ISD::FMADD_RND, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
+    }
+  }
+  if (!DCI.isBeforeLegalizeOps()) {
+    // Lowering of FNEG was delayed due to possible optimizations.
+    // If FNEG requires lowering, do it now.
+    TargetLowering::LegalizeAction Action =
+        TLI.getOperationAction(N->getOpcode(), VT);
+    assert((Action == TargetLowering::Custom ||
+            Action == TargetLowering::Legal) && "Unexpected lowering of FNEG");
+    if (Action == TargetLowering::Custom) {
+      // Force FNEG legalization
+      SDValue NewNode = LowerFABSorFNEG(SDValue(N, 0), DAG, true);
+      assert(NewNode.getNode() && "Unexpected result of FNEG lowering");
+      return NewNode;
     }
   }
   return SDValue();
@@ -30569,13 +30602,22 @@
   if (NegC)
     C = C.getOperand(0);
 
-  unsigned Opcode;
+  unsigned NewOpcode;
   if (!NegMul)
-    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   else
-    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
 
-  return DAG.getNode(Opcode, dl, VT, A, B, C);
+  if (N->getOpcode() == X86ISD::FMADD_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+    }
+    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+  }
+  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
 }
 
 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -31248,7 +31290,7 @@
   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
-  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
+  case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
@@ -31296,6 +31338,8 @@
   case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
+  case X86ISD::FMADD:
+  case X86ISD::FMADD_RND:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
Index: ../test/CodeGen/X86/fma-fneg-combine.ll
===================================================================
--- ../test/CodeGen/X86/fma-fneg-combine.ll
+++ ../test/CodeGen/X86/fma-fneg-combine.ll
@@ -7,8 +7,7 @@
 define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm2, %zmm2
-; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -24,8 +23,7 @@
 define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -36,8 +34,7 @@
 define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -48,8 +45,7 @@
 define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -60,8 +56,7 @@
 define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm2, %zmm2
-; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vfmsub213ps {ru-sae}, %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
@@ -73,8 +68,8 @@
 define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2
@@ -86,8 +81,7 @@
 define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
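
Note (addendum, not part of the patch): the FNEG(FMA) cases in combineFneg, including the new _RND ones, all rely on the same sign algebra; the rounding-mode variants just carry the extra rounding operand (operand 3) through unchanged. Below is a minimal standalone C++ sketch of that algebra, using scalar doubles in place of vector operands; it is illustrative only, not LLVM code, and the harness itself is hypothetical.

#include <cassert>

int main() {
  // Exact-in-double inputs, so the equality asserts below are safe.
  const double a = 2.0, b = 3.0, c = 5.0;

  // The four x86 FMA flavors, spelled out as plain arithmetic.
  const double fmadd  =  (a * b) + c;  // FMADD(a, b, c)
  const double fmsub  =  (a * b) - c;  // FMSUB(a, b, c)
  const double fnmadd = -(a * b) + c;  // FNMADD(a, b, c)
  const double fnmsub = -(a * b) - c;  // FNMSUB(a, b, c)

  // Negating one flavor yields another, which is what lets combineFneg
  // replace FNEG(FMA...) with a single complementary FMA node:
  assert(-fmadd  == fnmsub);   // FNEG(FMADD)  -> FNMSUB
  assert(-fmsub  == fnmadd);   // FNEG(FMSUB)  -> FNMADD
  assert(-fnmadd == fmsub);    // FNEG(FNMADD) -> FMSUB
  assert(-fnmsub == fmadd);    // FNEG(FNMSUB) -> FMADD
  return 0;
}

The same identities explain the test updates above: each expectation drops its vxorps (the materialized FNEG) and switches to the complementary FMA instruction.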