Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -934,9 +934,18 @@
 static TargetLowering::LegalizeAction
 getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
+  auto Action = TLI.getOperationAction(Opcode, VT);
+  if (Action == TargetLowering::Custom)
+    return Action;
+
   unsigned EqOpc;
   switch (Opcode) {
     default: llvm_unreachable("Unexpected FP pseudo-opcode");
+    case ISD::STRICT_FADD: EqOpc = ISD::FADD; break;
+    case ISD::STRICT_FSUB: EqOpc = ISD::FSUB; break;
+    case ISD::STRICT_FMUL: EqOpc = ISD::FMUL; break;
+    case ISD::STRICT_FDIV: EqOpc = ISD::FDIV; break;
+    case ISD::STRICT_FREM: EqOpc = ISD::FREM; break;
    case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
    case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
    case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
@@ -952,7 +961,7 @@
    case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
  }

-  auto Action = TLI.getOperationAction(EqOpc, VT);
+  Action = TLI.getOperationAction(EqOpc, VT);

  // We don't currently handle Custom or Promote for strict FP pseudo-ops.
  // For now, we just expand for those cases.
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5462,12 +5462,12 @@
                              getValue(I.getArgOperand(1)),
                              getValue(I.getArgOperand(2))));
    return nullptr;
+  case Intrinsic::experimental_constrained_fma:
  case Intrinsic::experimental_constrained_fadd:
  case Intrinsic::experimental_constrained_fsub:
  case Intrinsic::experimental_constrained_fmul:
  case Intrinsic::experimental_constrained_fdiv:
  case Intrinsic::experimental_constrained_frem:
-  case Intrinsic::experimental_constrained_fma:
  case Intrinsic::experimental_constrained_sqrt:
  case Intrinsic::experimental_constrained_pow:
  case Intrinsic::experimental_constrained_powi:
@@ -6059,18 +6059,21 @@
  SDVTList VTs = DAG.getVTList(ValueVTs);
  SDValue Result;

+  const SDValue FPMode = DAG.getConstant(FPI.getRoundingMode(), sdl, MVT::i32);
+
  if (FPI.isUnaryOp())
    Result = DAG.getNode(Opcode, sdl, VTs,
-                         { Chain, getValue(FPI.getArgOperand(0)) });
+                         {Chain, getValue(FPI.getArgOperand(0)), FPMode});
  else if (FPI.isTernaryOp())
    Result = DAG.getNode(Opcode, sdl, VTs,
-                         { Chain, getValue(FPI.getArgOperand(0)),
-                           getValue(FPI.getArgOperand(1)),
-                           getValue(FPI.getArgOperand(2)) });
+                         {Chain, getValue(FPI.getArgOperand(0)),
+                          getValue(FPI.getArgOperand(1)),
+                          getValue(FPI.getArgOperand(2)), FPMode});
  else
    Result = DAG.getNode(Opcode, sdl, VTs,
-                         { Chain, getValue(FPI.getArgOperand(0)),
-                           getValue(FPI.getArgOperand(1)) });
+                         {Chain, getValue(FPI.getArgOperand(0)),
+                          getValue(FPI.getArgOperand(1)), FPMode});

  assert(Result.getNode()->getNumValues() == 2);
  SDValue OutChain = Result.getValue(1);
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -204,7 +204,8 @@
  void SelectDIV_SCALE(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
-  void SelectFMUL_W_CHAIN(SDNode *N);
+  void SelectStrictBinOp_W_CHAIN(SDNode *N);
+  void SelectStrictUnaryOp_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
@@ -471,15 +473,20 @@
    SelectUADDO_USUBO(N);
    return;
  }
+  case AMDGPUISD::FSQRT_W_CHAIN: {
+    SelectStrictUnaryOp_W_CHAIN(N);
+    return;
+  }
+  case AMDGPUISD::FADD_W_CHAIN:
+  case AMDGPUISD::FSUB_W_CHAIN:
  case AMDGPUISD::FMUL_W_CHAIN: {
-    SelectFMUL_W_CHAIN(N);
+    SelectStrictBinOp_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }
-  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
@@ -780,11 +787,15 @@
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);
-
-  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+  assert((N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64) &&
+         "Incorrect value type!");
+  unsigned TargetOpc = N->getValueType(0) == MVT::f32 ?
+                       AMDGPU::V_FMA_F32 :
+                       AMDGPU::V_FMA_F64;
+  CurDAG->SelectNodeTo(N, TargetOpc, N->getVTList(), Ops);
 }

-void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectStrictBinOp_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  //    src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];
@@ -793,8 +804,62 @@
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);
+  unsigned TargetOpc;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unexpected opcode encountered!");
+  case AMDGPUISD::FADD_W_CHAIN:
+    if (N->getValueType(0) == MVT::f16)
+      TargetOpc = AMDGPU::V_ADD_F16_e64;
+    else
+      TargetOpc = N->getValueType(0) == MVT::f32 ?
+                  AMDGPU::V_ADD_F32_e64 :
+                  AMDGPU::V_ADD_F64;
+    break;
+  case AMDGPUISD::FSUB_W_CHAIN:
+    assert((N->getValueType(0) == MVT::f16 || N->getValueType(0) == MVT::f32) &&
+           "Unexpected type encountered!");
+    TargetOpc = N->getValueType(0) == MVT::f16 ?
+                AMDGPU::V_SUB_F16_e64 :
+                AMDGPU::V_SUB_F32_e64;
+    break;
+  case AMDGPUISD::FMUL_W_CHAIN:
+    if (N->getValueType(0) == MVT::f16)
+      TargetOpc = AMDGPU::V_MUL_F16_e64;
+    else
+      TargetOpc = N->getValueType(0) == MVT::f32 ?
+                  AMDGPU::V_MUL_F32_e64 :
+                  AMDGPU::V_MUL_F64;
+    break;
+  }
+
+  CurDAG->SelectNodeTo(N, TargetOpc, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectStrictUnaryOp_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+
+  //    src0_modifiers, src0, clamp, omod
+  SDValue Ops[6];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[2], Ops[3]);
+  Ops[4] = N->getOperand(0);
+  Ops[5] = N->getOperand(2);
+  unsigned TargetOpc;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unexpected opcode encountered!");
+  case AMDGPUISD::FSQRT_W_CHAIN:
+    TargetOpc = N->getValueType(0) == MVT::f32 ?
+                AMDGPU::V_SQRT_F32_e64 :
+                AMDGPU::V_SQRT_F64_e64;
+    break;
+  case AMDGPUISD::FSIN_W_CHAIN:
+    TargetOpc = N->getValueType(0) == MVT::f32 ?
+                AMDGPU::V_SIN_F32_e64 :
+                AMDGPU::V_SIN_F16_e64;
+    break;
+  }

-  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+  CurDAG->SelectNodeTo(N, TargetOpc, N->getVTList(), Ops);
 }

 // We need to handle this here because tablegen doesn't support matching
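Note on the path above: after the SelectionDAGBuilder change, every constrained FP intrinsic reaches the DAG with an explicit i32 rounding-mode operand appended after its ordinary arguments, and the AMDGPU selector then picks a VOP3 (_e64) encoding so the source modifiers and the chain survive selection. A minimal IR-level sketch of input that exercises this path (the kernel below is illustrative, not part of the patch's tests):

declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)

define amdgpu_kernel void @mul_upward(float addrspace(1)* %out, float %a, float %b) {
  ; Built as FMUL_W_CHAIN with a rounding-mode operand; selected as v_mul_f32_e64.
  %r = call float @llvm.experimental.constrained.fmul.f32(float %a, float %b, metadata !"round.upward", metadata !"fpexcept.strict")
  store float %r, float addrspace(1)* %out
  ret void
}

Keeping the rounding mode as an explicit constant operand (rather than re-reading the FP environment later) is what lets the target fold it into the s_setreg pair during the custom lowering further down.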
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -327,6 +327,22 @@
  // FP ops with input and output chain.
  FMA_W_CHAIN,
  FMUL_W_CHAIN,
+  FADD_W_CHAIN,
+  FSUB_W_CHAIN,
+  FDIV_W_CHAIN,
+  FREM_W_CHAIN,
+  FSQRT_W_CHAIN,
+  FPOW_W_CHAIN,
+  FPOWI_W_CHAIN,
+  FSIN_W_CHAIN,
+  FCOS_W_CHAIN,
+  FEXP_W_CHAIN,
+  FEXP2_W_CHAIN,
+  FLOG_W_CHAIN,
+  FLOG10_W_CHAIN,
+  FLOG2_W_CHAIN,
+  FRINT_W_CHAIN,
+  FNEARBYINT_W_CHAIN,

  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
  // Denormals handled on some parts.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3717,6 +3717,22 @@
  NODE_NAME_CASE(SETREG)
  NODE_NAME_CASE(FMA_W_CHAIN)
  NODE_NAME_CASE(FMUL_W_CHAIN)
+  NODE_NAME_CASE(FADD_W_CHAIN)
+  NODE_NAME_CASE(FSUB_W_CHAIN)
+  NODE_NAME_CASE(FDIV_W_CHAIN)
+  NODE_NAME_CASE(FREM_W_CHAIN)
+  NODE_NAME_CASE(FSQRT_W_CHAIN)
+  NODE_NAME_CASE(FPOW_W_CHAIN)
+  NODE_NAME_CASE(FPOWI_W_CHAIN)
+  NODE_NAME_CASE(FSIN_W_CHAIN)
+  NODE_NAME_CASE(FCOS_W_CHAIN)
+  NODE_NAME_CASE(FEXP_W_CHAIN)
+  NODE_NAME_CASE(FEXP2_W_CHAIN)
+  NODE_NAME_CASE(FLOG_W_CHAIN)
+  NODE_NAME_CASE(FLOG10_W_CHAIN)
+  NODE_NAME_CASE(FLOG2_W_CHAIN)
+  NODE_NAME_CASE(FRINT_W_CHAIN)
+  NODE_NAME_CASE(FNEARBYINT_W_CHAIN)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -224,10 +224,58 @@
 def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;

+def AMDGPUfadd : SDNode<"AMDGPUISD::FADD_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfsub : SDNode<"AMDGPUISD::FSUB_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfdiv : SDNode<"AMDGPUISD::FDIV_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfrem : SDNode<"AMDGPUISD::FREM_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

-def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+def AMDGPUsqrt : SDNode<"AMDGPUISD::FSQRT_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUpow : SDNode<"AMDGPUISD::FPOW_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUpowi : SDNode<"AMDGPUISD::FPOWI_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUsin_chain : SDNode<"AMDGPUISD::FSIN_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUcos_chain : SDNode<"AMDGPUISD::FCOS_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUexp : SDNode<"AMDGPUISD::FEXP_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUexp2 : SDNode<"AMDGPUISD::FEXP2_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUlog : SDNode<"AMDGPUISD::FLOG_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUlog10 : SDNode<"AMDGPUISD::FLOG10_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUlog2 : SDNode<"AMDGPUISD::FLOG2_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUrint : SDNode<"AMDGPUISD::FRINT_W_CHAIN", SDTFPUnaryOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUnearbyint : SDNode<"AMDGPUISD::FNEARBYINT_W_CHAIN", SDTFPUnaryOp, [
  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -458,6 +458,9 @@
 #define FP_ROUND_MODE_SP(x) ((x) & 0x3)
 #define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)

+#define OFFSET_SINGLE_FP_ROUND 0
+#define OFFSET_DOUBLE_FP_ROUND 2
+
 #define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
 #define FP_DENORM_FLUSH_OUT 1
 #define FP_DENORM_FLUSH_IN 2
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -54,6 +54,7 @@
  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerConstrainedFPs(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -316,6 +316,61 @@
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

+  setOperationAction(ISD::STRICT_FADD, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FADD, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FREM, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FREM, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FMA, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FMA, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FPOW, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FPOW, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FPOWI, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FPOWI, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FSIN, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FCOS, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FCOS, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FEXP, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FEXP, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FEXP2, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FEXP2, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FLOG, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FLOG, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FLOG10, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FLOG10, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FLOG2, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FLOG2, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FRINT, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FRINT, MVT::f64, Custom);
+
+  setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom);
+
  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::Constant, MVT::i16, Legal);
@@ -3175,6 +3232,25 @@
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
+  case ISD::STRICT_FADD:
+  case ISD::STRICT_FSUB:
+  case ISD::STRICT_FMUL:
+  case ISD::STRICT_FDIV:
+  case ISD::STRICT_FREM:
+  case ISD::STRICT_FMA:
+  case ISD::STRICT_FSQRT:
+  case ISD::STRICT_FPOW:
+  case ISD::STRICT_FPOWI:
+  case ISD::STRICT_FSIN:
+  case ISD::STRICT_FCOS:
+  case ISD::STRICT_FEXP:
+  case ISD::STRICT_FEXP2:
+  case ISD::STRICT_FLOG:
+  case ISD::STRICT_FLOG10:
+  case ISD::STRICT_FLOG2:
+  case ISD::STRICT_FRINT:
+  case ISD::STRICT_FNEARBYINT:
+    return LowerConstrainedFPs(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
@@ -4630,22 +4706,55 @@
  return SDValue();
 }

+static SDValue getFPUnaryOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                            EVT VT, SDValue A, SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A);
+  }
+
+  assert(GlueChain->getNumValues() == 2 || GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FSQRT:
+    Opcode = AMDGPUISD::FSQRT_W_CHAIN; break;
+  }
+
+  if (GlueChain->getNumValues() == 2)
+    return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(0), A,
+                       GlueChain.getValue(1));
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A,
+                     GlueChain.getValue(2));
+}
+
 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B);
  }

-  assert(GlueChain->getNumValues() == 3);
+  assert(GlueChain->getNumValues() == 2 || GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FADD:
+    Opcode = AMDGPUISD::FADD_W_CHAIN; break;
+  case ISD::FSUB:
+    Opcode = AMDGPUISD::FSUB_W_CHAIN; break;
  case ISD::FMUL:
-    Opcode = AMDGPUISD::FMUL_W_CHAIN;
-    break;
+    Opcode = AMDGPUISD::FMUL_W_CHAIN; break;
+  case ISD::FDIV:
+    Opcode = AMDGPUISD::FDIV_W_CHAIN; break;
  }

+  if (GlueChain->getNumValues() == 2)
+    return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(0), A, B,
+                       GlueChain.getValue(1));
+
  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
                     GlueChain.getValue(2));
 }
@@ -4657,7 +4766,7 @@
    return DAG.getNode(Opcode, SL, VT, A, B, C);
  }

-  assert(GlueChain->getNumValues() == 3);
+  assert(GlueChain->getNumValues() == 2 || GlueChain->getNumValues() == 3);

  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
  switch (Opcode) {
@@ -4667,6 +4776,10 @@
    break;
  }

+  if (GlueChain->getNumValues() == 2)
+    return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(0), A, B, C,
+                       GlueChain.getValue(1));
+
  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
                     GlueChain.getValue(2));
 }
@@ -4890,6 +5003,106 @@
  llvm_unreachable("Unexpected type for fdiv");
 }

+SDValue SITargetLowering::LowerConstrainedFPs(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  // Retrieve the FP rounding mode from the last operand, which
+  // SelectionDAGBuilder appended to the node.
+  SDValue RoundModeSD = Op.getOperand(Op.getNumOperands() - 1);
+  unsigned RoundModeValue =
+      cast<ConstantSDNode>(RoundModeSD.getNode())->getZExtValue();
+  unsigned RoundOffset = 0, RoundingMode = 0;
+  if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::f32)
+    RoundOffset = OFFSET_SINGLE_FP_ROUND;
+  else
+    RoundOffset = OFFSET_DOUBLE_FP_ROUND;
+
+  switch (RoundModeValue) {
+  case llvm::ConstrainedFPIntrinsic::rmDynamic:
+    // Dynamic rounding mode is not supported yet; fall back to the
+    // default (round-to-nearest) encoding.
+    break;
+  case llvm::ConstrainedFPIntrinsic::rmToNearest:
+    RoundingMode = FP_ROUND_ROUND_TO_NEAREST; break;
+  case llvm::ConstrainedFPIntrinsic::rmDownward:
+    RoundingMode = FP_ROUND_ROUND_TO_NEGINF; break;
+  case llvm::ConstrainedFPIntrinsic::rmUpward:
+    RoundingMode = FP_ROUND_ROUND_TO_INF; break;
+  case llvm::ConstrainedFPIntrinsic::rmTowardZero:
+    RoundingMode = FP_ROUND_ROUND_TO_ZERO; break;
+  default: llvm_unreachable("Unknown fp mode code!");
+  }
+
+  const unsigned RoundingModeHwReg = AMDGPU::Hwreg::ID_MODE |
+      (RoundOffset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+      (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+  const SDValue BitField = DAG.getTargetConstant(RoundingModeHwReg, SL,
+                                                 MVT::i16);
+
+  SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  const SDValue RoundModeConst = DAG.getConstant(RoundingMode, SL, MVT::i32);
+
+  SDValue SetRoundMode = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+                                     DAG.getEntryNode(),
+                                     RoundModeConst, BitField);
+
+  // Map the strict pseudo-opcode to its regular FP equivalent.
+  unsigned EqOpc;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::STRICT_FADD: EqOpc = ISD::FADD; break;
+  case ISD::STRICT_FSUB: EqOpc = ISD::FSUB; break;
+  case ISD::STRICT_FMUL: EqOpc = ISD::FMUL; break;
+  case ISD::STRICT_FDIV: EqOpc = ISD::FDIV; break;
+  case ISD::STRICT_FMA: EqOpc = ISD::FMA; break;
+  case ISD::STRICT_FREM: EqOpc = ISD::FREM; break;
+  case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
+  case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
+  case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
+  case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
+  case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
+  case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
+  case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break;
+  case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
+  case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
+  case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
+  case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
+  case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
+  }
+
+  SDValue Res;
+  if (Op.getNumOperands() == 3)
+    Res = getFPUnaryOp(DAG, EqOpc, SL, Op.getValueType(),
+                       Op.getOperand(1), SetRoundMode);
+  else if (Op.getNumOperands() == 4)
+    Res = getFPBinOp(DAG, EqOpc, SL, Op.getValueType(),
+                     Op.getOperand(1), Op.getOperand(2), SetRoundMode);
+  else if (Op.getNumOperands() == 5)
+    Res = getFPTernOp(DAG, EqOpc, SL, Op.getValueType(), Op.getOperand(1),
+                      Op.getOperand(2), Op.getOperand(3), SetRoundMode);
+  else
+    llvm_unreachable("Unexpected number of operands!");
+
+  const SDValue DefaultRoundModeConst =
+      DAG.getConstant(FP_ROUND_ROUND_TO_NEAREST, SL, MVT::i32);
+
+  SDValue RestoreRoundMode = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+                                         Res.getValue(1),
+                                         DefaultRoundModeConst,
+                                         BitField,
+                                         Res.getValue(2));
+
+  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                    RestoreRoundMode, DAG.getRoot());
+  DAG.setRoot(OutputChain);
+
+  return Res;
+}
+
 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
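One behavioral note before the tests: in LowerConstrainedFPs, the rmDynamic case falls through with RoundingMode left at zero, so !"round.dynamic" is currently lowered exactly like !"round.tonearest" (the MODE field is still written with the round-to-nearest encoding rather than being left untouched). A small IR sample that takes this fallback path, assuming that behavior (function name illustrative):

declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)

define amdgpu_kernel void @fadd_f64_dynamic(double addrspace(1)* %out, double %a, double %b) {
  ; round.dynamic currently produces the same s_setreg sequence as round.tonearest.
  %r = call double @llvm.experimental.constrained.fadd.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
  store double %r, double addrspace(1)* %out
  ret void
}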
Index: test/CodeGen/AMDGPU/constrained_fp.ll
===================================================================
--- test/CodeGen/AMDGPU/constrained_fp.ll
+++ test/CodeGen/AMDGPU/constrained_fp.ll
@@ -1,19 +1,132 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) nounwind readnone
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) nounwind readnone
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) nounwind readnone
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) nounwind readnone
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) nounwind readnone
+declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) nounwind readnone

-; FUNC-LABEL: {{^}}fma_f64:
-; FUNC: s_setreg_b32
-; FUNC: v_fma_f64
-; FUNC: s_setreg_b32
-define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                                   double addrspace(1)* %in2, double addrspace(1)* %in3) {
+; GCN-LABEL: {{^}}fadd_f64_round_tonearest:
+; GCN: s_mov_b32
+; GCN: s_mov_b32 [[MODE:s[0-9]+]], 0
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2), [[MODE]]
+; GCN: v_add_f64
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2)
+define amdgpu_kernel void @fadd_f64_round_tonearest(double addrspace(1)* %out, double addrspace(1)* %in1,
+                                                    double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r3 = tail call double @llvm.experimental.constrained.fadd.f64(double %r0, double %r1, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_f64_round_downward:
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GCN: v_add_f64
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+define amdgpu_kernel void @fadd_f64_round_downward(double addrspace(1)* %out, double addrspace(1)* %in1,
+                                                   double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r3 = tail call double @llvm.experimental.constrained.fadd.f64(double %r0, double %r1, metadata !"round.downward", metadata !"fpexcept.strict")
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_f64_round_upward:
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GCN: v_add_f64
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+define amdgpu_kernel void @fadd_f64_round_upward(double addrspace(1)* %out, double addrspace(1)* %in1,
+                                                 double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r3 = tail call double @llvm.experimental.constrained.fadd.f64(double %r0, double %r1, metadata !"round.upward", metadata !"fpexcept.strict")
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_f64_round_towardzero:
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GCN: v_add_f64
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+define amdgpu_kernel void @fadd_f64_round_towardzero(double addrspace(1)* %out, double addrspace(1)* %in1,
+                                                     double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r3 = tail call double @llvm.experimental.constrained.fadd.f64(double %r0, double %r1, metadata !"round.towardzero", metadata !"fpexcept.strict")
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_f64_tonearest:
+; GCN: s_mov_b32
+; GCN: s_mov_b32 [[MODE:s[0-9]+]], 0
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2), [[MODE]]
+; GCN: v_mul_f64
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2)
+define amdgpu_kernel void @fmul_f64_tonearest(double addrspace(1)* %out, double addrspace(1)* %in1,
+                                              double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r3 = tail call double @llvm.experimental.constrained.fmul.f64(double %r0, double %r1, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fsqrt_f64_tonearest:
+; GCN: s_mov_b32
+; GCN: s_mov_b32 [[MODE:s[0-9]+]], 0
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2), [[MODE]]
+; GCN: v_sqrt_f64_e32
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 2, 2)
+define amdgpu_kernel void @fsqrt_f64_tonearest(double addrspace(1)* %out, double addrspace(1)* %in1) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = tail call double @llvm.experimental.constrained.sqrt.f64(double %r0, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %r1, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fsub_f32_tonearest:
+; GCN: s_mov_b32
+; GCN: s_mov_b32 [[MODE:s[0-9]+]], 0
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), [[MODE]]
+; GCN: v_sub_f32_e32
+; GCN: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2)
+define amdgpu_kernel void @fsub_f32_tonearest(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) {
+  %r0 = load float, float addrspace(1)* %in1
+  %r1 = load float, float addrspace(1)* %in2
+  %r2 = tail call float @llvm.experimental.constrained.fsub.f32(float %r0, float %r1, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %r2, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_f64:
+; GCN: s_setreg_b32
+; GCN: v_fma_f64
+; GCN: s_setreg_b32
+define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) {
  %r0 = load double, double addrspace(1)* %in1
  %r1 = load double, double addrspace(1)* %in2
  %r2 = load double, double addrspace(1)* %in3
-  %r3 = tail call double @llvm.experimental.constrained.fma.f64(double %r0, double %r1, double %r2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  %r3 = tail call double @llvm.experimental.constrained.fma.f64(double %r0, double %r1, double %r2, metadata !"round.tonearest", metadata !"fpexcept.strict")
  store double %r3, double addrspace(1)* %out
  ret void
 }

+; GCN-LABEL: {{^}}fma_f32:
+; GCN: s_setreg_b32
+; GCN: v_fma_f32
+; GCN: s_setreg_b32
+define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) {
+  %r0 = load float, float addrspace(1)* %in1
+  %r1 = load float, float addrspace(1)* %in2
+  %r2 = load float, float addrspace(1)* %in3
+  %r3 = tail call float @llvm.experimental.constrained.fma.f32(float %r0, float %r1, float %r2, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %r3, float addrspace(1)* %out
+  ret void
+}
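A closing usage sketch, not part of the patch's test file: because LowerConstrainedFPs wraps each strict node in its own SETREG set/restore pair, two constrained ops with different rounding modes in one kernel should each be bracketed independently. Assuming the lowering above (kernel and value names are illustrative):

declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)

define amdgpu_kernel void @mixed_rounding(float addrspace(1)* %out, float %a, float %b) {
  ; Expect: s_setreg (upward) / v_add_f32 / s_setreg (restore),
  ; then s_setreg (towardzero) / v_mul_f32 / s_setreg (restore).
  %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %b, metadata !"round.upward", metadata !"fpexcept.strict")
  %p = call float @llvm.experimental.constrained.fmul.f32(float %s, float %b, metadata !"round.towardzero", metadata !"fpexcept.strict")
  store float %p, float addrspace(1)* %out
  ret void
}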