Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -363,6 +363,7 @@ FMED3, SMED3, UMED3, + FDOT2, URECIP, DIV_SCALE, DIV_FMAS, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3993,6 +3993,7 @@ NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) + NODE_NAME_CASE(FDOT2) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -341,6 +341,11 @@ def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, SDTCisFP<0>, SDTCisVec<1>]>, + []>; + + def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -136,6 +136,7 @@ SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp 
=================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -623,6 +623,7 @@ setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); @@ -4930,6 +4931,9 @@ case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_fdot2: + return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -7461,6 +7465,60 @@ return SDValue(); } +SDValue SITargetLowering::performFMACombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + + if (!Subtarget->hasDLInsts() || VT != MVT::f16) + return SDValue(); + + // FMA(S0.x, S1. 
x, FMA(S0.y, S1.y, z)) -> FP_ROUND (FDOT2(S0, S1, (float)z)) + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + SDValue FMA = N->getOperand(2); + if (FMA.getOpcode() != ISD::FMA || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec1 = Op1.getOperand(0); + SDValue Idx1 = Op1.getOperand(1); + SDValue Vec2 = Op2.getOperand(0); + + SDValue FMAOp1 = FMA.getOperand(0); + SDValue FMAOp2 = FMA.getOperand(1); + SDValue FMAAcc = FMA.getOperand(2); + + if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec3 = FMAOp1.getOperand(0); + SDValue Vec4 = FMAOp2.getOperand(0); + SDValue Idx2 = FMAOp1.getOperand(1); + + if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || + // Idx1 and Idx2 cannot be the same. + Idx1 == Idx2) + return SDValue(); + + if (Vec1 == Vec2 || Vec3 == Vec4) + return SDValue(); + + if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) + return SDValue(); + + if ((Vec1 == Vec3 && Vec2 == Vec4) || + (Vec1 == Vec4 && Vec2 == Vec3)) { + SDValue DotOp3 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, FMAAcc); + SDValue Res = DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, DotOp3); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Res, DAG.getTargetConstant(0, SL, MVT::i32)); + } + return SDValue(); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -7645,6 +7703,8 @@ return performMinMaxCombine(N, DCI); break; } + case ISD::FMA: + return performFMACombine(N, DCI); case ISD::LOAD: { if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) return Widended; Index: lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- lib/Target/AMDGPU/VOP3PInstructions.td +++ lib/Target/AMDGPU/VOP3PInstructions.td @@ -167,7 +167,7 @@ let SubtargetPredicate = HasDLInsts in { -def V_DOT2_F32_F16 : 
VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>; +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>; def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>; def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>; def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>; Index: test/CodeGen/AMDGPU/dotproduct.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/dotproduct.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906 + +; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z) + +; GCN-LABEL: {{^}}fdot2_f16 +; GCN: v_fma_legacy_f16 +; GCN: v_fma_legacy_f16 +; GFX906: v_dot2_f32_f16 +define amdgpu_kernel void @fdot2_f16(<2 x half> addrspace(1)* %src1, + <2 x half> addrspace(1)* %src2, + half addrspace(1)* nocapture %dst) { +entry: + %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 + %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + + %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 + %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 + + %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 + %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 + + %mul2 = fmul fast half %src1.el2, %src2.el2 + %mul1 = fmul fast half %src1.el1, %src2.el1 + %acc = load half, half addrspace(1)* %dst, align 2 + %acc1 = fadd fast half %mul2, %acc + %acc2 = fadd fast half %mul1, %acc1 + store half %acc2, half addrspace(1)* %dst, align 2 + ret void +}