diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -193,13 +193,13 @@ // Deep learning builtins. //===----------------------------------------------------------------------===// -TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot2-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot7-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts") -TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot2-insts") +TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot7-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts") -TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot2-insts") +TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts") //===----------------------------------------------------------------------===// // GFX10+ only builtins. diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -183,6 +183,7 @@ Features["dot2-insts"] = true; Features["dot5-insts"] = true; Features["dot6-insts"] = true; + Features["dot7-insts"] = true; Features["dl-insts"] = true; Features["flat-address-space"] = true; Features["16-bit-insts"] = true; @@ -200,6 +201,7 @@ Features["dot2-insts"] = true; Features["dot5-insts"] = true; Features["dot6-insts"] = true; + Features["dot7-insts"] = true; LLVM_FALLTHROUGH; case GK_GFX1010: Features["dl-insts"] = true; @@ -227,6 +229,7 @@ Features["dl-insts"] = true; Features["dot1-insts"] = true; Features["dot2-insts"] = true; + Features["dot7-insts"] = true; LLVM_FALLTHROUGH; case GK_GFX90C: case GK_GFX909: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -50,17 +50,17 @@ // GFX900: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX902: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" +// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" +// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1031: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1032: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" -// GFX1033: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1031: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1032: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX1033: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" kernel void test() {} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -13,8 +13,8 @@ half2 v2hA, half2 v2hB, float fC, short2 v2ssA, short2 v2ssB, int siA, int siB, int siC, ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) { - fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot2-insts}} - fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot2-insts}} + fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} + fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} @@ -25,12 +25,12 @@ siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} - uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot2-insts}} - uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot2-insts}} + uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} + uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} - uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot2-insts}} - uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot2-insts}} + uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} + uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -480,7 +480,7 @@ def FeatureDot2Insts : SubtargetFeature<"dot2-insts", "HasDot2Insts", "true", - "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" + "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions" >; def FeatureDot3Insts : SubtargetFeature<"dot3-insts", @@ -507,6 +507,12 @@ "Has v_dot4c_i32_i8 instruction" >; +def FeatureDot7Insts : SubtargetFeature<"dot7-insts", + "HasDot7Insts", + "true", + "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -902,6 +908,7 @@ FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, + FeatureDot7Insts, FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; @@ -920,6 +927,7 @@ FeatureDot4Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, @@ -948,6 +956,7 @@ FeatureDot4Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, Feature64BitDPP, FeaturePackedFP32Ops, FeatureMAIInsts, @@ -1008,6 +1017,7 @@ FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureNSAEncoding, FeatureWavefrontSize32, FeatureScalarStores, @@ -1028,6 +1038,7 @@ FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureNSAEncoding, FeatureWavefrontSize32, FeatureScalarStores, @@ -1049,6 +1060,7 @@ FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureNSAEncoding, FeatureWavefrontSize32, FeatureShaderCyclesRegister]>; @@ -1373,6 +1385,9 @@ def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, AssemblerPredicate<(all_of FeatureDot6Insts)>; +def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, + AssemblerPredicate<(all_of FeatureDot7Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -267,6 +267,7 @@ HasDot4Insts(false), HasDot5Insts(false), HasDot6Insts(false), + HasDot7Insts(false), HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -150,6 +150,7 @@ bool HasDot4Insts; bool HasDot5Insts; bool HasDot6Insts; + bool HasDot7Insts; bool HasMAIInsts; bool HasPkFmacF16Inst; bool HasAtomicFaddInsts; @@ -687,6 +688,10 @@ return HasDot6Insts; } + bool hasDot7Insts() const { + return HasDot7Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10486,7 +10486,7 @@ EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDot2Insts() || VT != MVT::f32) + if (!Subtarget->hasDot7Insts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -287,19 +287,24 @@ let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3_Profile, - AMDGPUfdot2, 1/*ExplicitClamp*/>; def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile, int_amdgcn_sdot2, 1>; def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile, int_amdgcn_udot2, 1>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", + VOP3_Profile, + AMDGPUfdot2, 1/*ExplicitClamp*/>; def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile, int_amdgcn_udot4, 1>; def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile, int_amdgcn_udot8, 1>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { @@ -564,13 +569,18 @@ let SubtargetPredicate = HasDot2Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>; defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>; defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { @@ -657,13 +667,18 @@ let SubtargetPredicate = HasDot2Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in {