Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -367,6 +367,8 @@ SMED3, UMED3, FDOT2, + SDOT2, + UDOT2, URECIP, DIV_SCALE, DIV_FMAS, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4027,6 +4027,8 @@ NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) NODE_NAME_CASE(FDOT2) + NODE_NAME_CASE(SDOT2) + NODE_NAME_CASE(UDOT2) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -346,6 +346,16 @@ SDTCisFP<0>, SDTCisVec<1>]>, []>; +def AMDGPUsdot2 : SDNode<"AMDGPUISD::SDOT2", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + SDTCisInt<0>, SDTCisVec<1>]>, + []>; + +def AMDGPUudot2 : SDNode<"AMDGPUISD::UDOT2", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + SDTCisInt<0>, SDTCisVec<1>]>, + []>; + def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -4946,6 +4946,12 @@ case Intrinsic::amdgcn_fdot2: return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_sdot2: + return DAG.getNode(AMDGPUISD::SDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_udot2: + return DAG.getNode(AMDGPUISD::UDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case 
Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -7263,6 +7269,148 @@ return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } +// Return true if operands of MUL are from the correspondent elements of two +// different vectors, such as: S0.x * S1.x. +static bool isProductOfDot2(const SDValue &MUL, + SDValue &Vec1, + SDValue &Vec2, + SDValue &Indx) { + + SDValue MulOp1 = MUL.getOperand(0); + SDValue MulOp2 = MUL.getOperand(1); + + if (MulOp1.getOpcode() != ISD::SIGN_EXTEND && + MulOp1.getOpcode() != ISD::ZERO_EXTEND) + return false; + + if (MulOp1.getOpcode() != MulOp2.getOpcode()) + return false; + + MulOp1 = MulOp1.getOperand(0); + MulOp2 = MulOp2.getOperand(0); + + if (MulOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + MulOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + // Operands should be coming from different vectors. + if (MulOp1.getOperand(0) == MulOp2.getOperand(0)) + return false; + + if (MulOp1.getOperand(1) != MulOp2.getOperand(1)) + return false; + + Vec1 = MulOp1.getOperand(0); + Vec2 = MulOp2.getOperand(0); + Indx = MulOp1.getOperand(1); + + return true; +} + +// Returns true if N = add(mul (S0.x, S1.x), +// add(mul S0.y, S1.y), S3)) +static bool isMAD_OfMAD(const SDNode *N, + SDValue &M1, + SDValue &M2, + SDValue &Acc) { + if (N->getOpcode() != ISD::ADD || N->getValueType(0) != MVT::i32) + return false; + + SDValue ADD = N->getOperand(0); + SDValue MUL1 = N->getOperand(1); + + if (ADD.getOpcode() != ISD::ADD && MUL1.getOpcode() != ISD::ADD) + return false; + + if (ADD.getOpcode() != ISD::ADD) + std::swap(ADD, MUL1); + + unsigned MUL1Opc = MUL1.getOpcode(); + if (MUL1Opc != AMDGPUISD::MUL_U24 && + MUL1Opc != AMDGPUISD::MUL_I24) + return false; + + SDValue MUL2 = ADD.getOperand(0); + SDValue Z = ADD.getOperand(1); + unsigned MUL2Opc = MUL2.getOpcode(); + + if (MUL2Opc != MUL1Opc) { + std::swap(MUL2, Z); + MUL2Opc = MUL2.getOpcode(); + + if (MUL2Opc != MUL1Opc) + return 
false; + } + + M1 = MUL1; + M2 = MUL2; + Acc = Z; + return true; +} + +static SDValue getIntDot2(const SDNode *N, + SelectionDAG &DAG) { + if (N->getOpcode() != ISD::ADD || N->getValueType(0) != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue MUL1; + SDValue MUL2; + SDValue OP3; + + // Look for the following two patterns. + // Pattern1: add(mul (S0.x, S1.x), + // add_32 (mul S0.y, S1.y), S3)) + // Pattern2: add(add_32(mul (S0.x, S1.x), + // mul (S0.y, S1.y)), + // S3) + if (!isMAD_OfMAD(N, MUL1, MUL2, OP3)) { + SDValue MULs = N->getOperand(0); + OP3 = N->getOperand(1); + if (MULs.getOpcode() != ISD::ADD && OP3.getOpcode() != ISD::ADD) + return SDValue(); + if (MULs.getOpcode() != ISD::ADD) + std::swap(MULs, OP3); + + MUL1 = MULs.getOperand(0); + MUL2 = MULs.getOperand(1); + + if (MUL1.getOpcode() != MUL2.getOpcode()) + return SDValue(); + + if (MUL1.getOpcode() != AMDGPUISD::MUL_U24 && + MUL1.getOpcode() != AMDGPUISD::MUL_I24) + return SDValue(); + } + SDValue Vec1; + SDValue Vec2; + SDValue Indx1; + + if (!isProductOfDot2(MUL1, Vec1, Vec2, Indx1)) + return SDValue(); + + SDValue Vec3; + SDValue Vec4; + SDValue Indx2; + if (!isProductOfDot2(MUL2, Vec3, Vec4, Indx2)) + return SDValue(); + + if (Indx1 == Indx2) + return SDValue(); + + if (Vec1.getValueType() != MVT::v2i16 || Vec2.getValueType() != MVT::v2i16) + return SDValue(); + + if ((Vec1 == Vec3 && Vec2 == Vec4) || + (Vec1 == Vec4 && Vec2 == Vec3)) { + SDValue Dot2Inst = (MUL1.getOpcode() == AMDGPUISD::MUL_I24) ? 
+ DAG.getNode(AMDGPUISD::SDOT2, SL, MVT::i32, Vec1, Vec2, OP3) : + DAG.getNode(AMDGPUISD::UDOT2, SL, MVT::i32, Vec1, Vec2, OP3); + return Dot2Inst; + } + return SDValue(); +} + SDValue SITargetLowering::performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -7301,6 +7449,10 @@ return SDValue(); } + SDValue IDot2; + if (Subtarget->hasDLInsts() && (IDot2 = getIntDot2(N, DAG))) + return IDot2; + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); Index: lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- lib/Target/AMDGPU/VOP3PInstructions.td +++ lib/Target/AMDGPU/VOP3PInstructions.td @@ -168,8 +168,8 @@ let SubtargetPredicate = HasDLInsts in { def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile, AMDGPUfdot2>; -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile, int_amdgcn_sdot2>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile, int_amdgcn_udot2>; +def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile, AMDGPUsdot2>; +def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile, AMDGPUudot2>; def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile, int_amdgcn_sdot4>; def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile, int_amdgcn_udot4>; def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile, int_amdgcn_sdot8>; Index: test/CodeGen/AMDGPU/idot2.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/idot2.ll @@ -0,0 +1,290 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900 +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906 + +; add(mul(S0.x, S1.x), +; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) + +; GCN-LABEL: {{^}}udot2 + +; GFX900: v_mad_u32_u24 +; GFX900: v_mad_u32_u24 + +; GFX906: v_dot2_u32_u16 +define
amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; add(S3, +; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) +; GCN-LABEL: {{^}}udot2_MulMul + +; GFX900: v_add_u32_e32 + +; GFX906: v_dot2_u32_u16 +define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %mul1 + %add6 = add i32 %add, %s3 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}idot2 + +; GFX900: v_mad_i32_i24 +; GFX900: v_mad_i32_i24 + +; GFX906: v_dot2_i32_i16
+define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = sext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = sext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = sext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = sext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}idot2_MixedTypedMul + +; GFX900: v_mad_i32_i24 + +; GFX906: v_mad_i32_i24 +define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = sext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = sext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}udot2_alt_AddOperands + +; GFX900: v_mad_u32_u24 +; GFX900: v_mad_u32_u24 + +; GFX906: v_dot2_u32_u16 +define amdgpu_kernel void @udot2_alt_AddOperands(<2 x 
i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %s3, %mul2 + %add6 = add i32 %mul1, %add + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}idot2_MixedExt + +; GFX900: v_mad_i32_i24 +; GFX900: v_mad_i32_i24 + +; GFX906: v_mad_i32_i24 +define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = sext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul nuw i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = sext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = sext i16 %s2.elt2 to i32 + %mul2 = mul nuw i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}notudot2_SameVec + +; GFX900: v_mad_u32_u24 + +; GFX906: v_mad_u32_u24 +define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + 
i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}notudot2_LargerVec + +; GFX900: v_mad_u32_u24 + +; GFX906: v_mad_u32_u24 +define amdgpu_kernel void @notudot2_LargerVec(<4 x i16> addrspace(1)* %src1, + <4 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 + %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul i32 %conv2, %conv + + %s1.elt2 = extractelement <4 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <4 x i16> %vec2, i64 1 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +} + +; GCN-LABEL: {{^}}notudot2_DiffIndex + +; GFX900: v_mad_u32_u24 + +; GFX906: v_mad_u32_u24 +define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, + <2 x i16> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <2 x i16>, <2 x i16> 
addrspace(1)* %src1 + %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 + + %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 + %conv = zext i16 %s1.elt1 to i32 + %s2.elt1 = extractelement <2 x i16> %vec2, i64 1 + %conv2 = zext i16 %s2.elt1 to i32 + %mul1 = mul i32 %conv2, %conv + + %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 + %conv3 = zext i16 %s1.elt2 to i32 + %s2.elt2 = extractelement <2 x i16> %vec2, i64 0 + %conv4 = zext i16 %s2.elt2 to i32 + %mul2 = mul i32 %conv4, %conv3 + + %s3 = load i32, i32 addrspace(1)* %dst, align 4 + %add = add i32 %mul2, %s3 + %add6 = add i32 %add, %mul1 + store i32 %add6, i32 addrspace(1)* %dst, align 4 + ret void +}