Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -135,6 +135,8 @@ void SelectADD_SUB_I64(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectFMA_W_CHAIN(SDNode *N); + void SelectFMUL_W_CHAIN(SDNode *N); SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width); @@ -283,6 +285,15 @@ SelectADD_SUB_I64(N); return; } + case AMDGPUISD::FMUL_W_CHAIN: { + SelectFMUL_W_CHAIN(N); + return; + } + case AMDGPUISD::FMA_W_CHAIN: { + SelectFMA_W_CHAIN(N); + return; + } + case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { @@ -640,6 +651,36 @@ CurDAG->RemoveDeadNode(N); } +// Select FMA_W_CHAIN to V_FMA_F32. Operands 1-3 are the FMA sources; operand 0 +// (the chain) and operand 4 (the glue) are appended after the VOP3 operands. +void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod, chain, glue + SDValue Ops[10]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]); + Ops[8] = N->getOperand(0); + Ops[9] = N->getOperand(4); + + CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); +} + +// Select FMUL_W_CHAIN to V_MUL_F32_e64. Operands 1-2 are the multiply sources; +// operand 0 (the chain) and operand 3 (the glue) are appended after them. +void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { + // src0_modifiers, src0, src1_modifiers, src1, clamp, omod, chain, glue + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + Ops[6] = N->getOperand(0); + Ops[7] = N->getOperand(3); + + CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops); +} + // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. 
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -222,6 +222,11 @@ // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. SETCC, + SETREG, + // This FMA has an input and an output chain. + FMA_W_CHAIN, + // This FMUL has an input and an output chain. + FMUL_W_CHAIN, // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2780,6 +2780,9 @@ NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(FMA_W_CHAIN) + NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(COS_HW) NODE_NAME_CASE(SIN_HW) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -150,6 +150,19 @@ def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +def AMDGPUSetRegOp : SDTypeProfile<0, 2, [ + SDTCisSameAs<0, 1>, SDTCisInt<0> +]>; + +def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [ + SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ + SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ + SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", Index: lib/Target/AMDGPU/SIISelLowering.h 
=================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -38,6 +38,7 @@ SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2787,6 +2787,103 @@ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } +// Lower an f32 fdiv to the DIV_SCALE / RCP / FMA refinement / DIV_FMAS / +// DIV_FIXUP sequence. The refinement nodes are chained and glued between two +// SETREG nodes that enable and then disable FP32 denormals. +SDValue SITargetLowering::LowerFDIV32_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); + + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + LHS, RHS, LHS); + + // Denominator is scaled to not be denormal, so using rcp is ok. 
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, + DenominatorScaled); + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, + DenominatorScaled); + + // Only write the FP32 denormal field of the MODE register: BitField is + // hwreg(HW_REG_MODE, 4, 2), so the rounding mode and the FP64/FP16 denormal + // bits are preserved. A field value of 3 enables FP32 denormals and 0 + // flushes them. + SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + const SDValue EnableDenormValue = DAG.getTargetConstant(3, SL, MVT::i32); + const SDValue BitField = DAG.getTargetConstant(0x901, SL, MVT::i32); + SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), + EnableDenormValue, BitField); + + SDVTList FmaVT = DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue); + SDValue Fma0 = DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, FmaVT, + EnableDenorm.getValue(0), + NegDivScale0, ApproxRcp, One, + EnableDenorm.getValue(1)); + + SDValue Fma1 = DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, FmaVT, + Fma0.getValue(1), + Fma0.getValue(0), + ApproxRcp, + ApproxRcp, + Fma0.getValue(2)); + + SDValue Mul = DAG.getNode(AMDGPUISD::FMUL_W_CHAIN, SL, FmaVT, + Fma1.getValue(1), + NumeratorScaled, + Fma1.getValue(0), + Fma1.getValue(2)); + + SDValue Fma2 = DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, FmaVT, + Mul.getValue(1), + NegDivScale0, + Mul.getValue(0), + NumeratorScaled, + Mul.getValue(2)); + + SDValue Fma3 = DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, FmaVT, + Fma2.getValue(1), + Fma2.getValue(0), + Fma1.getValue(0), + Mul.getValue(0), + Fma2.getValue(2)); + + SDValue Fma4 = DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, FmaVT, + Fma3.getValue(1), + NegDivScale0, + Fma3.getValue(0), + NumeratorScaled, + Fma3.getValue(2)); + + // NOTE(review): FP32 denormals are flushed unconditionally afterwards; + // confirm this lowering is only reached when they are disabled by default. + const SDValue DisableDenormValue = DAG.getTargetConstant(0, SL, MVT::i32); + SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), + DisableDenormValue, + BitField, + Fma4.getValue(2)); + + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + DisableDenorm, DAG.getRoot()); + DAG.setRoot(OutputChain); + + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, 
MVT::f32, + Fma4, Fma1, Fma3, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); +} + SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) return lowerFastUnsafeFDIV(Op, DAG); @@ -2858,7 +2955,7 @@ EVT VT = Op.getValueType(); if (VT == MVT::f32) - return LowerFDIV32(Op, DAG); + return LowerFDIV32_W_CHAIN(Op, DAG); if (VT == MVT::f64) return LowerFDIV64(Op, DAG); Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1674,6 +1674,7 @@ // boundaries prevents incorrect movements of such instructions. return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || MI.modifiesRegister(AMDGPU::EXEC, &RI) || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || changesVGPRIndexingMode(MI); } Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -598,16 +598,15 @@ // FIXME: Not on SI? //def S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32">; - def S_SETREG_IMM32_B32 : SOPK_Pseudo < "s_setreg_imm32_b32", (outs), (ins i32imm:$imm, hwreg:$simm16), "$simm16, $imm"> { let Size = 8; // Unlike every other SOPK instruction. let has_sdst = 0; + let hasSideEffects = 1; } - //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// @@ -872,6 +871,13 @@ >; //===----------------------------------------------------------------------===// +// S_SETREG_IMM32_B32 Pattern. 
+//===----------------------------------------------------------------------===// +def : Pat < + (AMDGPUsetreg i32:$reg, i32:$simm16), + (S_SETREG_IMM32_B32 $reg, (as_i16imm $simm16)) +>; +//===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -219,9 +219,20 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; } - } // End SubtargetPredicate = isVI +def : Pat < + (AMDGPUfma (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers), + (VOP3NoMods f32:$src2, i32:$src2_modifiers)), + (V_FMA_F32 $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, $clamp, $omod) +>; + +def : Pat < + (f32 (AMDGPUmul (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers))), + (V_MUL_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $clamp, $omod) +>; //===----------------------------------------------------------------------===// // Target Index: test/CodeGen/AMDGPU/fdiv_setreg_chain.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fdiv_setreg_chain.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; The v_fma_f32/v_mul_f32 sequence must stay between the two s_setreg_imm32_b32. +; SI-LABEL: {{^}}fdiv_f32: +; SI: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI: s_setreg_imm32_b32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: s_setreg_imm32_b32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv 
float %a, %b + store float %fdiv, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" } +