Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -363,6 +363,7 @@
   FMED3,
   SMED3,
   UMED3,
+  FDOT2,
   URECIP,
   DIV_SCALE,
   DIV_FMAS,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3993,6 +3993,7 @@
   NODE_NAME_CASE(FMED3)
   NODE_NAME_CASE(SMED3)
   NODE_NAME_CASE(UMED3)
+  NODE_NAME_CASE(FDOT2)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -341,6 +341,12 @@
 def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
 
+def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+                       SDTCisFP<0>, SDTCisVec<1>]>,
+  []>;
+
 def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
 def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -136,6 +136,7 @@
   SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -623,6 +623,7 @@
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::SMAX);
   setTargetDAGCombine(ISD::UMIN);
@@ -4930,6 +4931,9 @@
   case Intrinsic::amdgcn_fmed3:
     return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_fdot2:
+    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_fmul_legacy:
     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                        Op.getOperand(1), Op.getOperand(2));
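Note: the intrinsic case above forwards its three operands directly to the new FDOT2 node. A minimal IR sketch of direct intrinsic use (the function name is hypothetical; the signature follows the <2 x half>/float operand types exercised by fdot2.ll below):

; declare the intrinsic and feed it two packed-half vectors plus an f32 accumulator
declare float @llvm.amdgcn.fdot2(<2 x half>, <2 x half>, float)

define float @fdot2_direct(<2 x half> %a, <2 x half> %b, float %c) {
entry:
  ; lowers to AMDGPUISD::FDOT2, which the VOP3P pattern below should select
  ; as v_dot2_f32_f16 on subtargets with HasDLInsts
  %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c)
  ret float %r
}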
@@ -7461,6 +7465,79 @@
   return SDValue();
 }
 
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDLoc SL(N);
+
+  if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+    return SDValue();
+
+  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+  SDValue Op1 = N->getOperand(0);
+  SDValue Op2 = N->getOperand(1);
+  SDValue FMA = N->getOperand(2);
+
+  if (FMA.getOpcode() != ISD::FMA ||
+      Op1.getOpcode() != ISD::FP_EXTEND ||
+      Op2.getOpcode() != ISD::FP_EXTEND)
+    return SDValue();
+
+  // fdot2_f32_f16 always flushes fp32 denormal operands and output to zero,
+  // regardless of the denorm mode setting. Therefore, it is legal to generate
+  // fdot2 even when there is no fp16/fp32 denormal support.
+  const TargetOptions &Options = DAG.getTarget().Options;
+  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+      (N->getFlags().hasAllowContract() &&
+       FMA->getFlags().hasAllowContract())) {
+    Op1 = Op1.getOperand(0);
+    Op2 = Op2.getOperand(0);
+    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    SDValue Vec1 = Op1.getOperand(0);
+    SDValue Idx1 = Op1.getOperand(1);
+    SDValue Vec2 = Op2.getOperand(0);
+
+    SDValue FMAOp1 = FMA.getOperand(0);
+    SDValue FMAOp2 = FMA.getOperand(1);
+    SDValue FMAAcc = FMA.getOperand(2);
+
+    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+        FMAOp2.getOpcode() != ISD::FP_EXTEND)
+      return SDValue();
+
+    FMAOp1 = FMAOp1.getOperand(0);
+    FMAOp2 = FMAOp2.getOperand(0);
+    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    SDValue Vec3 = FMAOp1.getOperand(0);
+    SDValue Vec4 = FMAOp2.getOperand(0);
+    SDValue Idx2 = FMAOp1.getOperand(1);
+
+    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+        // Idx1 and Idx2 cannot be the same.
+        Idx1 == Idx2)
+      return SDValue();
+
+    if (Vec1 == Vec2 || Vec3 == Vec4)
+      return SDValue();
+
+    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+      return SDValue();
+
+    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+        (Vec1 == Vec4 && Vec2 == Vec3))
+      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -7645,6 +7722,8 @@
       return performMinMaxCombine(N, DCI);
     break;
   }
+  case ISD::FMA:
+    return performFMACombine(N, DCI);
   case ISD::LOAD: {
     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
       return Widened;
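For illustration, the DAG shape performFMACombine matches corresponds to the following IR once FP contraction has formed fmas. This is a hedged sketch, not part of the patch; the names are illustrative, and it assumes contract-flagged llvm.fma calls reach the combine as ISD::FMA nodes carrying hasAllowContract():

declare float @llvm.fma.f32(float, float, float)

define float @dot2_fma_form(<2 x half> %a, <2 x half> %b, float %acc) {
entry:
  %a0 = extractelement <2 x half> %a, i64 0
  %b0 = extractelement <2 x half> %b, i64 0
  %a1 = extractelement <2 x half> %a, i64 1
  %b1 = extractelement <2 x half> %b, i64 1
  %fa0 = fpext half %a0 to float
  %fb0 = fpext half %b0 to float
  %fa1 = fpext half %a1 to float
  %fb1 = fpext half %b1 to float
  ; inner fma: accumulate the .y product into %acc
  %inner = call contract float @llvm.fma.f32(float %fa1, float %fb1, float %acc)
  ; outer fma: add the .x product; with both fmas contract-flagged (or with
  ; -fp-contract=fast or unsafe-fp-math), the pair should fold to FDOT2
  %outer = call contract float @llvm.fma.f32(float %fa0, float %fb0, float %inner)
  ret float %outer
}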
Index: lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3PInstructions.td
+++ lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,7 +167,7 @@
 
 let SubtargetPredicate = HasDLInsts in {
 
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
 def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
 def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
 def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
Index: test/CodeGen/AMDGPU/fdot2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fdot2.ll
@@ -0,0 +1,232 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-UNSAFE
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
+
+; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
+
+; Tests to make sure fdot2 is not generated when the vector elements of the
+; dot-product expression are not extended from f16 to f32.
+; GCN-LABEL: {{^}}dotproduct_f16
+; GFX900: v_fma_legacy_f16
+; GFX900: v_fma_legacy_f16
+
+; GFX906: v_mul_f16_e32
+; GFX906: v_mul_f16_e32
+
+; GFX906-UNSAFE: v_fma_legacy_f16
+
+; GFX906-CONTRACT: v_mac_f16_e32
+; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
+define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
+                                          <2 x half> addrspace(1)* %src2,
+                                          half addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+
+  %mul2 = fmul half %src1.el2, %src2.el2
+  %mul1 = fmul half %src1.el1, %src2.el1
+  %acc = load half, half addrspace(1)* %dst, align 2
+  %acc1 = fadd half %mul2, %acc
+  %acc2 = fadd half %mul1, %acc1
+  store half %acc2, half addrspace(1)* %dst, align 2
+  ret void
+}
+
+; We only want to generate fdot2 if the vector elements of the dot product are
+; extended from f16 to f32 and the vectors are of type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_f16_f32
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_dot2_f32_f16
+
+; GFX906-CONTRACT: v_dot2_f32_f16
+
+; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
+                                              <2 x half> addrspace(1)* %src2,
+                                              float addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul float %csrc1.el2, %csrc2.el2
+  %mul1 = fmul float %csrc1.el1, %csrc2.el1
+  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc1 = fadd float %mul2, %acc
+  %acc2 = fadd float %mul1, %acc1
+  store float %acc2, float addrspace(1)* %dst, align 4
+  ret void
+}
+
+; We only want to generate fdot2 if the vector elements of the dot product are
+; extended from f16 to f32 and the vectors are of type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_diffvecorder
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_dot2_f32_f16
+
+; GFX906-CONTRACT: v_dot2_f32_f16
+; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
+                                                   <2 x half> addrspace(1)* %src2,
+                                                   float addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul float %csrc2.el2, %csrc1.el2
+  %mul1 = fmul float %csrc1.el1, %csrc2.el1
+  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc1 = fadd float %mul2, %acc
+  %acc2 = fadd float %mul1, %acc1
+  store float %acc2, float addrspace(1)* %dst, align 4
+  ret void
+}
+
+; Tests to make sure fdot2 is not generated when the vectors are not of type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_v4f16
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
+                                            <4 x half> addrspace(1)* %src2,
+                                            float addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
+  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul float %csrc1.el2, %csrc2.el2
+  %mul1 = fmul float %csrc1.el1, %csrc2.el1
+  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc1 = fadd float %mul2, %acc
+  %acc2 = fadd float %mul1, %acc1
+  store float %acc2, float addrspace(1)* %dst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}NotAdotproduct
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
+                                          <2 x half> addrspace(1)* %src2,
+                                          float addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul float %csrc1.el2, %csrc1.el1
+  %mul1 = fmul float %csrc2.el1, %csrc2.el2
+  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc1 = fadd float %mul2, %acc
+  %acc2 = fadd float %mul1, %acc1
+  store float %acc2, float addrspace(1)* %dst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
+                                                   <2 x half> addrspace(1)* %src2,
+                                                   float addrspace(1)* nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul float %csrc1.el2, %csrc2.el1
+  %mul1 = fmul float %csrc1.el1, %csrc2.el2
+  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc1 = fadd float %mul2, %acc
+  %acc2 = fadd float %mul1, %acc1
+  store float %acc2, float addrspace(1)* %dst, align 4
+  ret void
+}
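One more shape the combine deliberately leaves alone, beyond the negative tests above: chains whose extended operands are not vector extracts at all. A hedged sketch, not part of the test file; with no EXTRACT_VECTOR_ELT feeding the fpexts, the checks in performFMACombine bail out, so ordinary mixed-precision selection (e.g. v_fma_mix_f32) should apply instead:

define amdgpu_kernel void @scalar_srcs_not_a_dotproduct(half %a0, half %b0,
                                                        half %a1, half %b1,
                                                        float addrspace(1)* nocapture %dst) {
entry:
  ; no extractelement feeds these fpexts, so the FMA chain is not a dot product
  ; of two <2 x half> vectors and must not become v_dot2_f32_f16
  %fa0 = fpext half %a0 to float
  %fb0 = fpext half %b0 to float
  %fa1 = fpext half %a1 to float
  %fb1 = fpext half %b1 to float
  %mul2 = fmul float %fa1, %fb1
  %mul1 = fmul float %fa0, %fb0
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}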