Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -85,7 +85,8 @@
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  unsigned getFusedOpcode(const SelectionDAG &DAG, EVT VT) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG,
+                          const SDNode *N0, const SDNode *N1) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3906,7 +3906,11 @@
   return SDValue();
 }
 
-unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, EVT VT) const {
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+                                          const SDNode *N0,
+                                          const SDNode *N1) const {
+  EVT VT = N0->getValueType(0);
+
   // Only do this if we are not trying to support denormals. v_mad_f32 does not
   // support denormals ever.
   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
@@ -3914,7 +3918,10 @@
     return ISD::FMAD;
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
+       Options.UnsafeFPMath ||
+       (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
+        cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
       isFMAFasterThanFMulAndFAdd(VT)) {
     return ISD::FMA;
   }
@@ -3942,7 +3949,7 @@
   if (LHS.getOpcode() == ISD::FADD) {
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
       if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         return DAG.getNode(FusedOp, SL, VT, Two, A, RHS);
@@ -3954,7 +3961,7 @@
   if (RHS.getOpcode() == ISD::FADD) {
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
       if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         return DAG.getNode(FusedOp, SL, VT, Two, A, LHS);
@@ -3986,7 +3993,7 @@
     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0){
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
@@ -4001,7 +4008,7 @@
 
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0){
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, NegTwo, A, LHS);
Index: test/CodeGen/AMDGPU/fmuladd.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -112,6 +112,69 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add0:
+; GCN-STRICT: v_add_f64
+; GCN-STRICT: v_add_f64
+
+; GCN-CONTRACT: v_fma_f64
+define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+                                      double addrspace(1)* %in1,
+                                      double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd fast double %r0, %r0
+  %add.1 = fadd double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add1:
+; GCN-STRICT: v_add_f64
+; GCN-STRICT: v_add_f64
+
+; GCN-CONTRACT: v_fma_f64
+define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+                                      double addrspace(1)* %in1,
+                                      double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd double %r0, %r0
+  %add.1 = fadd fast double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
+; GCN: v_fma_f64
+define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+                                 double addrspace(1)* %in1,
+                                 double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd fast double %r0, %r0
+  %add.1 = fadd fast double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare double @llvm.fmuladd.f64(double, double, double) #1
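
Note on the change above: getFusedOpcode now keys the FMA decision off the unsafe-algebra flags of the two nodes being combined, not only the global TargetOptions, which is what the new fast_add0/fast_add1/fast tests exercise: one flagged fadd is not enough, both nodes must carry the flag. The standalone C++ sketch below models that decision outside of LLVM for illustration only; every type and name in it (FusedOp, NodeFlags, GlobalOptions, SubtargetInfo) is a hypothetical stand-in, not the SelectionDAG API, and the denormal check is simplified to the f32 case.

// Standalone model of the fusion decision made by getFusedOpcode().
// All types and names here are illustrative stand-ins, not LLVM's API.
#include <cstdio>

enum class FusedOp { None, FMAD, FMA };

struct NodeFlags {               // models per-node SDNodeFlags
  bool UnsafeAlgebra = false;
};

struct GlobalOptions {           // models TargetOptions
  bool FPOpFusionFast = false;   // global "allow fp-op fusion" mode
  bool UnsafeFPMath = false;     // global unsafe-fp-math mode
};

struct SubtargetInfo {
  bool HasF32Denormals = true;   // v_mad_f32 never supports denormals
  bool FMAIsFast = true;         // models isFMAFasterThanFMulAndFAdd()
};

FusedOp getFusedOpcode(const GlobalOptions &Opts, const SubtargetInfo &ST,
                       const NodeFlags &N0, const NodeFlags &N1,
                       bool IsF32) {
  // Without denormal support, the MAD form is always usable.
  if (IsF32 && !ST.HasF32Denormals)
    return FusedOp::FMAD;

  // New in this patch: per-instruction unsafe-algebra flags on *both*
  // nodes are enough, even without the global unsafe options.
  bool FusionAllowed = Opts.FPOpFusionFast || Opts.UnsafeFPMath ||
                       (N0.UnsafeAlgebra && N1.UnsafeAlgebra);

  if (FusionAllowed && ST.FMAIsFast)
    return FusedOp::FMA;

  return FusedOp::None;
}

int main() {
  GlobalOptions Opts;            // strict global FP environment
  SubtargetInfo ST;              // f64-like: denormals supported
  NodeFlags Fast;
  Fast.UnsafeAlgebra = true;
  NodeFlags Strict;              // UnsafeAlgebra stays false

  // Both fadds carry 'fast': fusion happens (fadd_a_a_b_f64_fast test).
  std::printf("both fast -> FMA? %d\n",
              getFusedOpcode(Opts, ST, Fast, Fast, false) == FusedOp::FMA);
  // Only one fadd is 'fast': no fusion (fast_add0 / fast_add1 tests).
  std::printf("one fast  -> FMA? %d\n",
              getFusedOpcode(Opts, ST, Fast, Strict, false) == FusedOp::FMA);
  return 0;
}

The sketch prints 1 for the both-fast case and 0 for the mixed case, mirroring the GCN-STRICT checks above, which still expect two v_add_f64 instructions when only one of the adds is flagged.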