diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -148,6 +148,9 @@
     SRA,
     SHL,
 
+    /// FNMSUB - Negated multiply-subtract instruction.
+    FNMSUB,
+
     /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign
     /// word and shift left immediate.
     EXTSWSLI,
@@ -674,6 +677,10 @@
       return VT.isScalarInteger();
     }
 
+    SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps,
+                                 bool OptForSize, NegatibleCost &Cost,
+                                 unsigned Depth = 0) const override;
+
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
@@ -1202,6 +1209,7 @@
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1228,6 +1228,7 @@
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::BUILD_VECTOR);
   if (Subtarget.hasFPCVT())
@@ -1532,6 +1533,7 @@
   case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
   case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
   case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+  case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
   }
   return nullptr;
 }
@@ -14115,6 +14117,9 @@
     return combineSRL(N, DCI);
   case ISD::MUL:
     return combineMUL(N, DCI);
+  case ISD::FMA:
+  case PPCISD::FNMSUB:
+    return combineFMALike(N, DCI);
   case PPCISD::SHL:
     if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
       return N->getOperand(0);
@@ -15779,6 +15784,85 @@
   return PPC::createFastISel(FuncInfo, LibInfo);
 }
 
+// 'Inverted' means the FMA opcode after negating one multiplicand.
+// For example, (fma -a b c) = (fnmsub a b c)
+static unsigned invertFMAOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Invalid FMA opcode for PowerPC!");
+  case ISD::FMA:
+    return PPCISD::FNMSUB;
+  case PPCISD::FNMSUB:
+    return ISD::FMA;
+  }
+}
+
+SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+                                                bool LegalOps, bool OptForSize,
+                                                NegatibleCost &Cost,
+                                                unsigned Depth) const {
+  if (Depth > SelectionDAG::MaxRecursionDepth)
+    return SDValue();
+
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  SDNodeFlags Flags = Op.getNode()->getFlags();
+
+  switch (Opc) {
+  case PPCISD::FNMSUB:
+    // TODO: QPX subtarget is deprecated. No transformation here.
+    if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
+      break;
+
+    const TargetOptions &Options = getTargetMachine().Options;
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    SDValue N2 = Op.getOperand(2);
+    SDLoc Loc(Op);
+
+    NegatibleCost N2Cost = NegatibleCost::Expensive;
+    SDValue NegN2 =
+        getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
+
+    if (!NegN2)
+      return SDValue();
+
+    // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
+    // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
+    // These transformations may change sign of zeroes. For example,
+    // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
+    if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
+      // Try and choose the cheaper one to negate.
+      NegatibleCost N0Cost = NegatibleCost::Expensive;
+      SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
+                                           N0Cost, Depth + 1);
+
+      NegatibleCost N1Cost = NegatibleCost::Expensive;
+      SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
+                                           N1Cost, Depth + 1);
+
+      if (NegN0 && N0Cost <= N1Cost) {
+        Cost = std::min(N0Cost, N2Cost);
+        return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
+      } else if (NegN1) {
+        Cost = std::min(N1Cost, N2Cost);
+        return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
+      }
+    }
+
+    // (fneg (fnmsub a b c)) => (fma a b (fneg c))
+    if (isOperationLegal(ISD::FMA, VT)) {
+      Cost = N2Cost;
+      return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
+    }
+
+    break;
+  }
+
+  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
+                                              Cost, Depth);
+}
+
 // Override to enable LOAD_STACK_GUARD lowering on Linux.
 bool PPCTargetLowering::useLoadStackGuardNode() const {
   if (!Subtarget.isTargetLinux())
@@ -16185,6 +16269,45 @@
   }
 }
 
+// Combine FMA-like ops (such as FNMSUB) with fnegs into the appropriate opcode.
+// Do this in the DAG combiner, since we need to check SDNode flags and subtarget features.
+SDValue PPCTargetLowering::combineFMALike(SDNode *N,
+                                          DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDNodeFlags Flags = N->getFlags();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetOptions &Options = getTargetMachine().Options;
+  unsigned Opc = N->getOpcode();
+  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool LegalOps = !DCI.isBeforeLegalizeOps();
+  SDLoc Loc(N);
+
+  // TODO: QPX subtarget is deprecated. No transformation here.
+  if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT) ||
+      (VT.isVector() && !Subtarget.hasVSX()))
+    return SDValue();
+
+  // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
+  // since (fnmsub a b c)=-0 while c-ab=+0.
+  if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
+    return SDValue();
+
+  // (fma (fneg a) b c) => (fnmsub a b c)
+  // (fnmsub (fneg a) b c) => (fma a b c)
+  if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
+    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
+
+  // (fma a (fneg b) c) => (fnmsub a b c)
+  // (fnmsub a (fneg b) c) => (fma a b c)
+  if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
+    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
+
+  return SDValue();
+}
+
 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
   if (!Subtarget.is64BitELFABI())
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -216,6 +216,8 @@
 def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
 def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
 
+def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>;
+
 def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>;
 
 // Move 2 i64 values into a VSX register
@@ -3381,15 +3383,19 @@
 def : Pat<(atomic_fence (timm), (timm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
 
 let Predicates = [HasFPU] in {
-// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
-def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
-          (FNMSUB $A, $C, $B)>;
-def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
-          (FNMSUB $A, $C, $B)>;
-def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
-          (FNMSUBS $A, $C, $B)>;
-def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
-          (FNMSUBS $A, $C, $B)>;
+// Additional fnmsub patterns for the custom PPCfnmsub node
+def : Pat<(PPCfnmsub f64:$A, f64:$B, f64:$C),
+          (FNMSUB $A, $B, $C)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, f32:$C),
+          (FNMSUBS $A, $B, $C)>;
+def : Pat<(fneg (PPCfnmsub f64:$A, f64:$B, f64:$C)),
+          (FMSUB $A, $B, $C)>;
+def : Pat<(fneg (PPCfnmsub f32:$A, f32:$B, f32:$C)),
+          (FMSUBS $A, $B, $C)>;
+def : Pat<(PPCfnmsub f64:$A, f64:$B, (fneg f64:$C)),
+          (FNMADD $A, $B, $C)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, (fneg f32:$C)),
+          (FNMADDS $A, $B, $C)>;
 
 // FCOPYSIGN's operand types need not agree.
 def : Pat<(fcopysign f64:$frB, f32:$frA),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2416,21 +2416,27 @@
                         (and v4i32:$B, v4i32:$C))),
           (v4i32 (XXSEL $A, $B, $C))>;
 
-// Additional fnmsub patterns: -a*b + c == -(a*b - c)
-def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C),
-          (XSNMSUBADP $C, $A, $B)>;
-def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C),
+// Additional fnmsub patterns for the PPC-specific ISD opcode
+def : Pat<(PPCfnmsub f64:$A, f64:$B, f64:$C),
           (XSNMSUBADP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f64:$A, f64:$B, f64:$C)),
+          (XSMSUBADP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f64:$A, f64:$B, (fneg f64:$C)),
+          (XSNMADDADP $C, $A, $B)>;
 
-def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C),
-          (XVNMSUBADP $C, $A, $B)>;
-def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C),
+def : Pat<(PPCfnmsub v2f64:$A, v2f64:$B, v2f64:$C),
           (XVNMSUBADP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub v2f64:$A, v2f64:$B, v2f64:$C)),
+          (XVMSUBADP $C, $A, $B)>;
+def : Pat<(PPCfnmsub v2f64:$A, v2f64:$B, (fneg v2f64:$C)),
+          (XVNMADDADP $C, $A, $B)>;
 
-def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C),
-          (XVNMSUBASP $C, $A, $B)>;
-def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C),
+def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C),
           (XVNMSUBASP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C)),
+          (XVMSUBASP $C, $A, $B)>;
+def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)),
+          (XVNMADDASP $C, $A, $B)>;
 
 def : Pat<(v2f64 (bitconvert v4f32:$A)),
           (COPY_TO_REGCLASS $A, VSRC)>;
@@ -3003,11 +3009,13 @@
 def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
           (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
 
-// Additional xsnmsubasp patterns: -a*b + c == -(a*b - c)
-def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C),
-          (XSNMSUBASP $C, $A, $B)>;
-def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C),
+// Additional fnmsub patterns for the PPC-specific ISD opcode
+def : Pat<(PPCfnmsub f32:$A, f32:$B, f32:$C),
           (XSNMSUBASP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f32:$A, f32:$B, f32:$C)),
+          (XSMSUBASP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, (fneg f32:$C)),
+          (XSNMADDASP $C, $A, $B)>;
 
 // f32 neg
 // Although XSNEGDP is available in P7, we want to select it starting from P8,
@@ -3516,9 +3524,13 @@
 
 // Any Power9 VSX subtarget.
 let Predicates = [HasVSX, HasP9Vector] in {
-// Additional fnmsub patterns: -a*b + c == -(a*b - c)
-def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>;
-def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>;
+// Additional fnmsub patterns for the PPC-specific ISD opcode
+def : Pat<(PPCfnmsub f128:$A, f128:$B, f128:$C),
+          (XSNMSUBQP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f128:$A, f128:$B, f128:$C)),
+          (XSMSUBQP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f128:$A, f128:$B, (fneg f128:$C)),
+          (XSNMADDQP $C, $A, $B)>;
 
 def : Pat<(f128 (sint_to_fp i64:$src)),
           (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
--- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -13,10 +13,10 @@
 ; CHECK-NEXT:    xvredp 2, 0
 ; CHECK-NEXT:    xxswapd 1, 1
 ; CHECK-NEXT:    xxlor 3, 1, 1
-; CHECK-NEXT:    xvmaddadp 3, 0, 2
-; CHECK-NEXT:    xvnmsubadp 2, 2, 3
-; CHECK-NEXT:    xvmaddadp 1, 0, 2
-; CHECK-NEXT:    xvmsubadp 2, 2, 1
+; CHECK-NEXT:    xvnmsubadp 3, 0, 2
+; CHECK-NEXT:    xvmaddadp 2, 2, 3
+; CHECK-NEXT:    xvnmsubadp 1, 0, 2
+; CHECK-NEXT:    xvnmaddadp 2, 2, 1
 ; CHECK-NEXT:    xvmuldp 34, 34, 2
 ; CHECK-NEXT:    xvmuldp 35, 35, 2
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/f128-fma.ll b/llvm/test/CodeGen/PowerPC/f128-fma.ll
--- a/llvm/test/CodeGen/PowerPC/f128-fma.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-fma.ll
@@ -121,7 +121,7 @@
   %1 = load fp128, fp128* %b, align 16
   %2 = load fp128, fp128* %c, align 16
   %mul = fmul contract fp128 %1, %2
-  %sub = fsub contract fp128 %0, %mul
+  %sub = fsub contract nsz fp128 %0, %mul
   store fp128 %sub, fp128* %res, align 16
   ret void
 ; CHECK-LABEL: qpFmsub
diff --git a/llvm/test/CodeGen/PowerPC/fdiv.ll b/llvm/test/CodeGen/PowerPC/fdiv.ll
--- a/llvm/test/CodeGen/PowerPC/fdiv.ll
+++ b/llvm/test/CodeGen/PowerPC/fdiv.ll
@@ -10,6 +10,6 @@
 ; CHECK-NEXT:    xsmaddasp 0, 3, 1
 ; CHECK-NEXT:    fmr 1, 0
 ; CHECK-NEXT:    blr
-  %3 = fdiv reassoc arcp float %0, %1
+  %3 = fdiv reassoc arcp nsz float %0, %1
   ret float %3
 }
diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
--- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
@@ -225,14 +225,18 @@
 define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C,
 ; CHECK-LABEL: test_FMSUB_ASSOC_EXT3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 1, 2, 5
-; CHECK-NEXT:    fnmsub 1, 3, 4, 0
+; CHECK-NEXT:    fneg 0, 1
+; CHECK-NEXT:    fmadd 0, 0, 2, 5
+; CHECK-NEXT:    fneg 1, 3
+; CHECK-NEXT:    fmadd 1, 1, 4, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 1, 2, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 1, 3, 4
+; CHECK-VSX-NEXT:    xsnegdp 1, 1
+; CHECK-VSX-NEXT:    xsnegdp 0, 3
+; CHECK-VSX-NEXT:    xsmaddmdp 1, 2, 5
+; CHECK-VSX-NEXT:    xsmaddadp 1, 0, 4
 ; CHECK-VSX-NEXT:    blr
                              double %D, double %E) {
   %F = fmul float %A, %B          ; [#uses=1]
@@ -246,15 +250,19 @@
 define double @test_FMSUB_ASSOC_EXT4(float %A, float %B, float %C,
 ; CHECK-LABEL: test_FMSUB_ASSOC_EXT4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 3, 4, 5
-; CHECK-NEXT:    fnmsub 1, 1, 2, 0
+; CHECK-NEXT:    fneg 0, 3
+; CHECK-NEXT:    fmadd 0, 0, 4, 5
+; CHECK-NEXT:    fneg 1, 1
+; CHECK-NEXT:    fmadd 1, 1, 2, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 3, 4, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 3, 1, 2
-; CHECK-VSX-NEXT:    fmr 1, 3
+; CHECK-VSX-NEXT:    xsnegdp 0, 3
+; CHECK-VSX-NEXT:    xsnegdp 1, 1
+; CHECK-VSX-NEXT:    xsmaddmdp 0, 4, 5
+; CHECK-VSX-NEXT:    xsmaddadp 0, 1, 2
+; CHECK-VSX-NEXT:    fmr 1, 0
 ; CHECK-VSX-NEXT:    blr
                              float %D, double %E) {
   %F = fmul float %A, %B          ; [#uses=1]
@@ -503,14 +511,18 @@
 define double @test_reassoc_FMSUB_ASSOC_EXT3(float %A, float %B, double %C,
 ; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 1, 2, 5
-; CHECK-NEXT:    fnmsub 1, 3, 4, 0
+; CHECK-NEXT:    fneg 0, 1
+; CHECK-NEXT:    fmadd 0, 0, 2, 5
+; CHECK-NEXT:    fneg 1, 3
+; CHECK-NEXT:    fmadd 1, 1, 4, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT3:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 1, 2, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 1, 3, 4
+; CHECK-VSX-NEXT:    xsnegdp 1, 1
+; CHECK-VSX-NEXT:    xsnegdp 0, 3
+; CHECK-VSX-NEXT:    xsmaddmdp 1, 2, 5
+; CHECK-VSX-NEXT:    xsmaddadp 1, 0, 4
 ; CHECK-VSX-NEXT:    blr
                              double %D, double %E) {
   %F = fmul reassoc float %A, %B          ; [#uses=1]
@@ -521,18 +533,45 @@
   ret double %J
 }
 
+; fnmsub/xsnmsubadp may affect the sign of zero, so the nsz flag is needed
+; to ensure they are generated
+define double @test_fast_FMSUB_ASSOC_EXT3(float %A, float %B, double %C,
+; CHECK-LABEL: test_fast_FMSUB_ASSOC_EXT3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fnmsub 0, 1, 2, 5
+; CHECK-NEXT:    fnmsub 1, 3, 4, 0
+; CHECK-NEXT:    blr
+;
+; CHECK-VSX-LABEL: test_fast_FMSUB_ASSOC_EXT3:
+; CHECK-VSX:       # %bb.0:
+; CHECK-VSX-NEXT:    xsnmsubmdp 1, 2, 5
+; CHECK-VSX-NEXT:    xsnmsubadp 1, 3, 4
+; CHECK-VSX-NEXT:    blr
+                             double %D, double %E) {
+  %F = fmul reassoc float %A, %B
+  %G = fpext float %F to double
+  %H = fmul reassoc double %C, %D
+  %I = fadd reassoc nsz double %H, %G
+  %J = fsub reassoc nsz double %E, %I
+  ret double %J
+}
+
 define double @test_reassoc_FMSUB_ASSOC_EXT4(float %A, float %B, float %C,
 ; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 3, 4, 5
-; CHECK-NEXT:    fnmsub 1, 1, 2, 0
+; CHECK-NEXT:    fneg 0, 3
+; CHECK-NEXT:    fmadd 0, 0, 4, 5
+; CHECK-NEXT:    fneg 1, 1
+; CHECK-NEXT:    fmadd 1, 1, 2, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT4:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 3, 4, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 3, 1, 2
-; CHECK-VSX-NEXT:    fmr 1, 3
+; CHECK-VSX-NEXT:    xsnegdp 0, 3
+; CHECK-VSX-NEXT:    xsnegdp 1, 1
+; CHECK-VSX-NEXT:    xsmaddmdp 0, 4, 5
+; CHECK-VSX-NEXT:    xsmaddadp 0, 1, 2
+; CHECK-VSX-NEXT:    fmr 1, 0
 ; CHECK-VSX-NEXT:    blr
                              float %D, double %E) {
   %F = fmul reassoc float %A, %B          ; [#uses=1]
@@ -542,3 +581,25 @@
   %J = fsub reassoc double %E, %I          ; [#uses=1]
   ret double %J
 }
+
+define double @test_fast_FMSUB_ASSOC_EXT4(float %A, float %B, float %C,
+; CHECK-LABEL: test_fast_FMSUB_ASSOC_EXT4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fnmsub 0, 3, 4, 5
+; CHECK-NEXT:    fnmsub 1, 1, 2, 0
+; CHECK-NEXT:    blr
+;
+; CHECK-VSX-LABEL: test_fast_FMSUB_ASSOC_EXT4:
+; CHECK-VSX:       # %bb.0:
+; CHECK-VSX-NEXT:    xsnmsubmdp 3, 4, 5
+; CHECK-VSX-NEXT:    xsnmsubadp 3, 1, 2
+; CHECK-VSX-NEXT:    fmr 1, 3
+; CHECK-VSX-NEXT:    blr
+                             float %D, double %E) {
+  %F = fmul reassoc float %A, %B
+  %G = fmul reassoc float %C, %D
+  %H = fadd reassoc nsz float %F, %G
+  %I = fpext float %H to double
+  %J = fsub reassoc nsz double %E, %I
+  ret double %J
+}
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -176,9 +176,11 @@
 ; CHECK-NEXT:    addis 3, 2, .LCPI4_2@toc@ha
 ; CHECK-NEXT:    lfs 3, .LCPI4_1@toc@l(4)
 ; CHECK-NEXT:    lfs 1, .LCPI4_2@toc@l(3)
+; CHECK-NEXT:    fmr 4, 3
 ; CHECK-NEXT:    xsmaddasp 3, 2, 0
+; CHECK-NEXT:    xsnmaddasp 4, 2, 0
 ; CHECK-NEXT:    xsmaddasp 1, 2, 3
-; CHECK-NEXT:    xsnmsubasp 1, 3, 2
+; CHECK-NEXT:    xsmaddasp 1, 4, 2
 ; CHECK-NEXT:    blr
   %tmp = load float, float* undef, align 4
   %tmp2 = load float, float* undef, align 4
diff --git a/llvm/test/CodeGen/PowerPC/fma-ext.ll b/llvm/test/CodeGen/PowerPC/fma-ext.ll
--- a/llvm/test/CodeGen/PowerPC/fma-ext.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-ext.ll
@@ -49,10 +49,27 @@
   %F = fsub double %C, %E         ; [#uses=1]
   ret double %F
 ; CHECK-LABEL: test_FMSUB_EXT2:
-; CHECK: fnmsub
+; CHECK: fneg
+; CHECK-NEXT: fmadd
 ; CHECK-NEXT: blr
 
 ; CHECK-VSX-LABEL: test_FMSUB_EXT2:
+; CHECK-VSX: xsnegdp
+; CHECK-VSX-NEXT: xsmaddmdp
+; CHECK-VSX-NEXT: blr
+}
+
+; The nsz flag is needed to generate fnmsub, since it may affect the sign of zero
+define double @test_FMSUB_EXT2_NSZ(float %A, float %B, double %C) {
+  %D = fmul nsz float %A, %B      ; [#uses=1]
+  %E = fpext float %D to double   ; [#uses=1]
+  %F = fsub nsz double %C, %E     ; [#uses=1]
+  ret double %F
+; CHECK-LABEL: test_FMSUB_EXT2_NSZ:
+; CHECK: fnmsub
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMSUB_EXT2_NSZ:
 ; CHECK-VSX: xsnmsubmdp
 ; CHECK-VSX-NEXT: blr
 }
diff --git a/llvm/test/CodeGen/PowerPC/fma-negate.ll b/llvm/test/CodeGen/PowerPC/fma-negate.ll
--- a/llvm/test/CodeGen/PowerPC/fma-negate.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-negate.ll
@@ -7,12 +7,14 @@
 define double @test_mul_sub_f64(double %a, double %b, double %c) {
 ; VSX-LABEL: test_mul_sub_f64:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xsnmsubadp 1, 2, 3
+; VSX-NEXT:    xsnegdp 0, 2
+; VSX-NEXT:    xsmaddadp 1, 0, 3
 ; VSX-NEXT:    blr
 ;
 ; NO-VSX-LABEL: test_mul_sub_f64:
 ; NO-VSX:       # %bb.0: # %entry
-; NO-VSX-NEXT:    fnmsub 1, 2, 3, 1
+; NO-VSX-NEXT:    fneg 0, 2
+; NO-VSX-NEXT:    fmadd 1, 0, 3, 1
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fmul contract reassoc double %b, %c
@@ -43,13 +45,15 @@
 define double @test_neg_fma_f64(double %a, double %b, double %c) {
 ; VSX-LABEL: test_neg_fma_f64:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xsnmsubadp 3, 1, 2
+; VSX-NEXT:    xsnegdp 0, 1
+; VSX-NEXT:    xsmaddadp 3, 0, 2
 ; VSX-NEXT:    fmr 1, 3
 ; VSX-NEXT:    blr
 ;
 ; NO-VSX-LABEL: test_neg_fma_f64:
 ; NO-VSX:       # %bb.0: # %entry
-; NO-VSX-NEXT:    fnmsub 1, 1, 2, 3
+; NO-VSX-NEXT:    fneg 0, 1
+; NO-VSX-NEXT:    fmadd 1, 0, 2, 3
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fsub contract reassoc double -0.0, %a
@@ -61,12 +65,14 @@
 define float @test_mul_sub_f32(float %a, float %b, float %c) {
 ; VSX-LABEL: test_mul_sub_f32:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xsnmsubasp 1, 2, 3
+; VSX-NEXT:    xsnegdp 0, 2
+; VSX-NEXT:    xsmaddasp 1, 0, 3
 ; VSX-NEXT:    blr
 ;
 ; NO-VSX-LABEL: test_mul_sub_f32:
 ; NO-VSX:       # %bb.0: # %entry
-; NO-VSX-NEXT:    fnmsubs 1, 2, 3, 1
+; NO-VSX-NEXT:    fneg 0, 2
+; NO-VSX-NEXT:    fmadds 1, 0, 3, 1
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fmul contract reassoc float %b, %c
@@ -97,13 +103,15 @@
 define float @test_neg_fma_f32(float %a, float %b, float %c) {
 ; VSX-LABEL: test_neg_fma_f32:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xsnmsubasp 3, 1, 2
+; VSX-NEXT:    xsnegdp 0, 1
+; VSX-NEXT:    xsmaddasp 3, 0, 2
 ; VSX-NEXT:    fmr 1, 3
 ; VSX-NEXT:    blr
 ;
 ; NO-VSX-LABEL: test_neg_fma_f32:
 ; NO-VSX:       # %bb.0: # %entry
-; NO-VSX-NEXT:    fnmsubs 1, 1, 2, 3
+; NO-VSX-NEXT:    fneg 0, 1
+; NO-VSX-NEXT:    fmadds 1, 0, 2, 3
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fsub contract reassoc float -0.0, %a
@@ -114,14 +122,17 @@
 define <2 x double> @test_neg_fma_v2f64(<2 x double> %a, <2 x double> %b,
 ; VSX-LABEL: test_neg_fma_v2f64:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xvnmsubadp 36, 34, 35
+; VSX-NEXT:    xvnegdp 0, 34
+; VSX-NEXT:    xvmaddadp 36, 0, 35
 ; VSX-NEXT:    vmr 2, 4
 ; VSX-NEXT:    blr
 ;
 ; NO-VSX-LABEL: test_neg_fma_v2f64:
 ; NO-VSX:       # %bb.0: # %entry
-; NO-VSX-NEXT:    fnmsub 1, 1, 3, 5
-; NO-VSX-NEXT:    fnmsub 2, 2, 4, 6
+; NO-VSX-NEXT:    fneg 0, 2
+; NO-VSX-NEXT:    fneg 1, 1
+; NO-VSX-NEXT:    fmadd 1, 1, 3, 5
+; NO-VSX-NEXT:    fmadd 2, 0, 4, 6
 ; NO-VSX-NEXT:    blr
                                         <2 x double> %c) {
 entry:
@@ -135,7 +146,8 @@
 define <4 x float> @test_neg_fma_v4f32(<4 x float> %a, <4 x float> %b,
 ; VSX-LABEL: test_neg_fma_v4f32:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xvnmsubasp 36, 34, 35
+; VSX-NEXT:    xvnegsp 0, 34
+; VSX-NEXT:    xvmaddasp 36, 0, 35
 ; VSX-NEXT:    vmr 2, 4
 ; VSX-NEXT:    blr
 ;
@@ -167,8 +179,8 @@
 ; NO-VSX-NEXT:    fnmsub 1, 2, 3, 1
 ; NO-VSX-NEXT:    blr
 entry:
-  %0 = fmul reassoc double %b, %c
-  %1 = fsub reassoc double %a, %0
+  %0 = fmul reassoc nsz double %b, %c
+  %1 = fsub reassoc nsz double %a, %0
   ret double %1
 }
 
@@ -206,7 +218,7 @@
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fsub reassoc double -0.0, %a
-  %1 = call reassoc double @llvm.fma.f64(double %0, double %b, double %c)
+  %1 = call reassoc nsz double @llvm.fma.f64(double %0, double %b, double %c)
   ret double %1
 }
 
@@ -222,7 +234,7 @@
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fmul reassoc float %b, %c
-  %1 = fsub reassoc float %a, %0
+  %1 = fsub reassoc nsz float %a, %0
   ret float %1
 }
 
@@ -242,7 +254,7 @@
 entry:
   %0 = fmul reassoc float %a, %b
   %1 = fmul reassoc float %c, %d
-  %2 = fsub reassoc float %0, %1
+  %2 = fsub reassoc nsz float %0, %1
   ret float %2
 }
 
@@ -259,7 +271,7 @@
 ; NO-VSX-NEXT:    blr
 entry:
   %0 = fsub reassoc float -0.0, %a
-  %1 = call reassoc float @llvm.fma.f32(float %0, float %b, float %c)
+  %1 = call reassoc nsz float @llvm.fma.f32(float %0, float %b, float %c)
   ret float %1
 }
 
@@ -278,7 +290,7 @@
                                          <2 x double> %c) {
 entry:
   %0 = fsub reassoc <2 x double> <double -0.0, double -0.0>, %a
-  %1 = call reassoc <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %b,
+  %1 = call reassoc nsz <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %b,
                                                  <2 x double> %c)
   ret <2 x double> %1
 }
@@ -301,8 +313,8 @@
 entry:
   %0 = fsub reassoc <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %a
-  %1 = call reassoc <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %b,
-                                                <4 x float> %c)
+  %1 = call reassoc nsz <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %b,
+                                                    <4 x float> %c)
   ret <4 x float> %1
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/fma-precision.ll b/llvm/test/CodeGen/PowerPC/fma-precision.ll
--- a/llvm/test/CodeGen/PowerPC/fma-precision.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-precision.ll
@@ -13,7 +13,7 @@
 entry:
   %mul = fmul reassoc double %b, %a
   %mul1 = fmul reassoc double %d, %c
-  %sub = fsub reassoc double %mul, %mul1
+  %sub = fsub reassoc nsz double %mul, %mul1
   %mul3 = fmul reassoc double %mul, %sub
   ret double %mul3
 }
@@ -113,7 +113,7 @@
   store double %ab, double* %p1 ; extra use of %ab
   store double %ab, double* %p2 ; another extra use of %ab
   store double %cd, double* %p3 ; extra use of %cd
-  %r = fsub reassoc double %ab, %cd
+  %r = fsub reassoc nsz double %ab, %cd
   ret double %r
 }
 
@@ -156,8 +156,8 @@
   store double %ab, double* %p1 ; extra use of %ab
   store double %ab, double* %p2 ; another extra use of %ab
   store double %fg, double* %p3 ; extra use of %fg
-  %q = fsub reassoc double %fg, %cd ; The uses of %cd reduce to 1 after %r is folded. 2 uses of %fg, fold %cd, remove def of %cd
-  %r = fsub reassoc double %ab, %cd ; Fold %r before %q. 3 uses of %ab, 2 uses of %cd, fold %cd
+  %q = fsub reassoc nsz double %fg, %cd ; The uses of %cd reduce to 1 after %r is folded. 2 uses of %fg, fold %cd, remove def of %cd
+  %r = fsub reassoc nsz double %ab, %cd ; Fold %r before %q. 3 uses of %ab, 2 uses of %cd, fold %cd
   %add = fadd reassoc double %r, %q
   ret double %add
 }
diff --git a/llvm/test/CodeGen/PowerPC/fma.ll b/llvm/test/CodeGen/PowerPC/fma.ll
--- a/llvm/test/CodeGen/PowerPC/fma.ll
+++ b/llvm/test/CodeGen/PowerPC/fma.ll
@@ -95,10 +95,25 @@
   %E = fsub double %C, %D         ; [#uses=1]
   ret double %E
 ; CHECK-LABEL: test_FNMSUB1:
-; CHECK: fnmsub
+; CHECK: fneg
+; CHECK-NEXT: fmadd
 ; CHECK-NEXT: blr
 
 ; CHECK-VSX-LABEL: test_FNMSUB1:
+; CHECK-VSX: xsnegdp
+; CHECK-VSX-NEXT: xsmaddmdp
+}
+
+; The nsz flag is needed to generate fnmsub, since it may affect the sign of zero
+define double @test_FNMSUB1_NSZ(double %A, double %B, double %C) {
+  %D = fmul nsz double %A, %B     ; [#uses=1]
+  %E = fsub nsz double %C, %D     ; [#uses=1]
+  ret double %E
+; CHECK-LABEL: test_FNMSUB1_NSZ:
+; CHECK: fnmsub
+; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUB1_NSZ:
 ; CHECK-VSX: xsnmsubmdp
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll
--- a/llvm/test/CodeGen/PowerPC/recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/recipest.ll
@@ -429,9 +429,9 @@
 ; CHECK-P9-NEXT:    xsmaddasp 4, 1, 0
 ; CHECK-P9-NEXT:    xsmulsp 1, 3, 4
 ; CHECK-P9-NEXT:    blr
-  %x = call reassoc arcp float @llvm.sqrt.f32(float %a)
-  %y = fmul reassoc float %x, %b
-  %z = fdiv reassoc arcp float %c, %y
+  %x = call reassoc arcp nsz float @llvm.sqrt.f32(float %a)
+  %y = fmul reassoc nsz float %x, %b
+  %z = fdiv reassoc arcp nsz float %c, %y
   ret float %z
 }
 
@@ -651,7 +651,7 @@
 ; CHECK-P9-NEXT:    xsmaddasp 0, 3, 1
 ; CHECK-P9-NEXT:    fmr 1, 0
 ; CHECK-P9-NEXT:    blr
-  %r = fdiv reassoc arcp float %a, %b
+  %r = fdiv reassoc arcp nsz float %a, %b
   ret float %r
 }
 
@@ -705,7 +705,7 @@
 ; CHECK-P9-NEXT:    xvmaddasp 0, 1, 34
 ; CHECK-P9-NEXT:    xxlor 34, 0, 0
 ; CHECK-P9-NEXT:    blr
-  %r = fdiv reassoc arcp <4 x float> %a, %b
+  %r = fdiv reassoc arcp nsz <4 x float> %a, %b
   ret <4 x float> %r
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
--- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
+++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
@@ -13,9 +13,9 @@
 ; CHECK-NEXT:    lvx 4, 0, 3
 ; CHECK-NEXT:    xxspltw 0, 0, 0
 ; CHECK-NEXT:    xvresp 1, 0
-; CHECK-NEXT:    xvmaddasp 35, 0, 1
+; CHECK-NEXT:    xvnmsubasp 35, 0, 1
 ; CHECK-NEXT:    xvmulsp 0, 34, 36
-; CHECK-NEXT:    xvnmsubasp 1, 1, 35
+; CHECK-NEXT:    xvmaddasp 1, 1, 35
 ; CHECK-NEXT:    xvmulsp 34, 0, 1
 ; CHECK-NEXT:    blr
   %ins = insertelement <4 x float> undef, float %a, i32 0