Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -18,13 +18,20 @@ #endif #pragma push_macro("SM_70") -#define SM_70 "sm_70|sm_71" +#pragma push_macro("SM_72") +#pragma push_macro("SM_75") +#define SM_75 "sm_75" +#define SM_72 "sm_72|" SM_75 +#define SM_70 "sm_70|" SM_72 + #pragma push_macro("SM_60") #define SM_60 "sm_60|sm_61|sm_62|" SM_70 -#pragma push_macro("PTX61") -#define PTX61 "ptx61" #pragma push_macro("PTX60") +#pragma push_macro("PTX61") +#pragma push_macro("PTX63") +#define PTX63 "ptx63" +#define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 #pragma push_macro("AND") @@ -666,10 +673,52 @@ TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +// Builtins to support integer and sub-integer WMMA instructions on sm_72/sm_75 +TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*i*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_st_c_i32, "vi*i*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_st_c_i32, "vi*i*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_st_c_i32, "vi*i*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_a_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_a_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_b_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) 
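For readers of these .def entries, the prototype string and feature constraint decode as follows; the parameter names below are illustrative and follow the argument handling in the CGBuiltin.cpp hunk later in this patch, they are not declared anywhere by the patch itself:

// "vi*iC*UiIi" gated on AND(SM_75,PTX63), e.g. __imma_m8n8k32_ld_a_s4, reads roughly as:
//   void __imma_m8n8k32_ld_a_s4(int *dst, const int *src, unsigned ldm,
//                               int layout /* must be an integer constant expression */);
//   and is available only when both the "sm_75" and "ptx63" target features are set.
// The SM_7x/PTX6x macros chain ("sm_72|sm_75", "ptx61|ptx63", ...), so a builtin gated
// on an older architecture or PTX version remains available on newer ones; SM_75 and
// PTX63 are currently the last links in those chains.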
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*i*UiIi", "", AND(SM_75,PTX63)) + #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") #pragma pop_macro("SM_60") #pragma pop_macro("SM_70") +#pragma pop_macro("SM_72") +#pragma pop_macro("SM_75") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") +#pragma pop_macro("PTX63") Index: clang/lib/Basic/Targets/NVPTX.cpp =================================================================== --- clang/lib/Basic/Targets/NVPTX.cpp +++ clang/lib/Basic/Targets/NVPTX.cpp @@ -44,6 +44,8 @@ if (!Feature.startswith("+ptx")) continue; PTXVersion = llvm::StringSwitch(Feature) + .Case("+ptx64", 64) + .Case("+ptx63", 63) .Case("+ptx61", 61) .Case("+ptx60", 60) .Case("+ptx50", 50) Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -12879,8 +12879,266 @@ } } -Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, - const CallExpr *E) { +namespace { +// Helper classes for mapping MMA builtins to particular LLVM intrinsic variant. +class NVPTXMmaLdstInfo { +public: + unsigned NumResults; // Number of elements to load/store + // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported. + unsigned IID_col; + unsigned IID_row; + NVPTXMmaLdstInfo(unsigned BuiltinID) + : NVPTXMmaLdstInfo(getNVPTXMmaLdstInfo(BuiltinID)) {} + +private: + NVPTXMmaLdstInfo(unsigned NumResults, unsigned IID_col, unsigned IID_row) + : NumResults(NumResults), IID_col(IID_col), IID_row(IID_row) {} + +#define MMA_INTR(geom_op_type, layout) \ + Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride +#define MMA_LDST(n, geom_op_type) \ + { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) } + + static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) { + switch (BuiltinID) { + // FP MMA loads + case NVPTX::BI__hmma_m16n16k16_ld_a: + return MMA_LDST(8, m16n16k16_load_a_f16); + case NVPTX::BI__hmma_m16n16k16_ld_b: + return MMA_LDST(8, m16n16k16_load_b_f16); + case NVPTX::BI__hmma_m16n16k16_ld_c_f16: + return MMA_LDST(4, m16n16k16_load_c_f16); + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + return MMA_LDST(8, m16n16k16_load_c_f32); + case NVPTX::BI__hmma_m32n8k16_ld_a: + return MMA_LDST(8, m32n8k16_load_a_f16); + case NVPTX::BI__hmma_m32n8k16_ld_b: + return MMA_LDST(8, m32n8k16_load_b_f16); + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + return MMA_LDST(4, m32n8k16_load_c_f16); + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + return MMA_LDST(8, m32n8k16_load_c_f32); + case NVPTX::BI__hmma_m8n32k16_ld_a: + return MMA_LDST(8, m8n32k16_load_a_f16); + case NVPTX::BI__hmma_m8n32k16_ld_b: + return MMA_LDST(8, m8n32k16_load_b_f16); + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + return MMA_LDST(4, m8n32k16_load_c_f16); + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: + return MMA_LDST(8, m8n32k16_load_c_f32); + + // Integer MMA loads + case NVPTX::BI__imma_m16n16k16_ld_a_s8: + return MMA_LDST(2, m16n16k16_load_a_s8); + case NVPTX::BI__imma_m16n16k16_ld_a_u8: + return MMA_LDST(2, m16n16k16_load_a_u8); + case NVPTX::BI__imma_m16n16k16_ld_b_s8: + return MMA_LDST(2, m16n16k16_load_b_s8); + case NVPTX::BI__imma_m16n16k16_ld_b_u8: + return MMA_LDST(2, 
m16n16k16_load_b_u8); + case NVPTX::BI__imma_m16n16k16_ld_c: + return MMA_LDST(8, m16n16k16_load_c_s32); + case NVPTX::BI__imma_m32n8k16_ld_a_s8: + return MMA_LDST(4, m32n8k16_load_a_s8); + case NVPTX::BI__imma_m32n8k16_ld_a_u8: + return MMA_LDST(4, m32n8k16_load_a_u8); + case NVPTX::BI__imma_m32n8k16_ld_b_s8: + return MMA_LDST(1, m32n8k16_load_b_s8); + case NVPTX::BI__imma_m32n8k16_ld_b_u8: + return MMA_LDST(1, m32n8k16_load_b_u8); + case NVPTX::BI__imma_m32n8k16_ld_c: + return MMA_LDST(8, m32n8k16_load_c_s32); + case NVPTX::BI__imma_m8n32k16_ld_a_s8: + return MMA_LDST(1, m8n32k16_load_a_s8); + case NVPTX::BI__imma_m8n32k16_ld_a_u8: + return MMA_LDST(1, m8n32k16_load_a_u8); + case NVPTX::BI__imma_m8n32k16_ld_b_s8: + return MMA_LDST(4, m8n32k16_load_b_s8); + case NVPTX::BI__imma_m8n32k16_ld_b_u8: + return MMA_LDST(4, m8n32k16_load_b_u8); + case NVPTX::BI__imma_m8n32k16_ld_c: + return MMA_LDST(8, m8n32k16_load_c_s32); + + // Sub-integer MMA loads. + // Only row/col layout is supported by A/B fragments. + case NVPTX::BI__imma_m8n8k32_ld_a_s4: + return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)}; + case NVPTX::BI__imma_m8n8k32_ld_a_u4: + return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)}; + case NVPTX::BI__imma_m8n8k32_ld_b_s4: + return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0}; + case NVPTX::BI__imma_m8n8k32_ld_b_u4: + return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0}; + case NVPTX::BI__imma_m8n8k32_ld_c: + return MMA_LDST(2, m8n8k32_load_c_s32); + case NVPTX::BI__bmma_m8n8k128_ld_a_b1: + return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)}; + case NVPTX::BI__bmma_m8n8k128_ld_b_b1: + return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0}; + case NVPTX::BI__bmma_m8n8k128_ld_c: + return MMA_LDST(2, m8n8k128_load_c_s32); + + // NOTE: We need to follow inconsitent naming scheme used by NVCC. Unlike + // PTX and LLVM IR where stores always use fragment D, NVCC builtins always + // use fragment C for both loads and stores. + // FP MMA stores. + case NVPTX::BI__hmma_m16n16k16_st_c_f16: + return MMA_LDST(4, m16n16k16_store_d_f16); + case NVPTX::BI__hmma_m16n16k16_st_c_f32: + return MMA_LDST(8, m16n16k16_store_d_f32); + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + return MMA_LDST(4, m32n8k16_store_d_f16); + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + return MMA_LDST(8, m32n8k16_store_d_f32); + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + return MMA_LDST(4, m8n32k16_store_d_f16); + case NVPTX::BI__hmma_m8n32k16_st_c_f32: + return MMA_LDST(8, m8n32k16_store_d_f32); + + // Integer and sub-integer MMA stores. + // Another naming quirk. Unlike other MMA builtins that use PTX types in the + // name, integer loads/stores use LLVM's i32. + case NVPTX::BI__imma_m16n16k16_st_c_i32: + return MMA_LDST(8, m16n16k16_store_d_s32); + case NVPTX::BI__imma_m32n8k16_st_c_i32: + return MMA_LDST(8, m32n8k16_store_d_s32); + case NVPTX::BI__imma_m8n32k16_st_c_i32: + return MMA_LDST(8, m8n32k16_store_d_s32); + case NVPTX::BI__imma_m8n8k32_st_c_i32: + return MMA_LDST(2, m8n8k32_store_d_s32); + case NVPTX::BI__bmma_m8n8k128_st_c_i32: + return MMA_LDST(2, m8n8k128_store_d_s32); + + default: + llvm_unreachable("Unknown MMA builtin"); + } + } +#undef MMA_LDST +#undef MMA_INTR +}; + +class NVPTXMmaInfo { +private: + +public: + unsigned NumEltsA; + unsigned NumEltsB; + unsigned NumEltsC; + unsigned NumEltsD; + std::array Variants; + NVPTXMmaInfo(unsigned BuiltinID) : NVPTXMmaInfo(getNVPTXMmaInfo(BuiltinID)) {} + + // Returns an intrinsic that matches Layout and Satf for valid combinations of + // Layout and Satf, 0 otherwise. 
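// Worked example of the Layout/Satf indexing described above (the call is
// illustrative; the encoding matches the layout constants exercised in
// builtins-nvptx-mma.cu: 0 = row.row, 1 = row.col, 2 = col.row, 3 = col.col):
//   __imma_m8n8k32_mma_s4(d, a, b, c, /*Layout=*/1, /*Satf=*/1)
//   -> Index = 1 * 2 + 1 = 3
//   -> Variants[3] == Intrinsic::nvvm_wmma_m8n8k32_mma_row_col_s4_satfinite
// Slots holding 0 (unsupported Layout/Satf combinations, e.g. col.row for the
// sub-integer geometries) make getMMAIntrinsic() return 0, and the caller in
// EmitNVPTXBuiltinExpr() returns nullptr without emitting an intrinsic call.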
+ unsigned getMMAIntrinsic(int Layout, bool Satf) { + unsigned Index = Layout * 2 + Satf; + if (Index >= Variants.size()) + return 0; + return Variants[Index]; + } + +private: + NVPTXMmaInfo(unsigned NumA, unsigned NumB, unsigned NumC, unsigned NumD, + std::array Variants) + : NumEltsA(NumA), NumEltsB(NumB), NumEltsC(NumC), NumEltsD(NumD), + Variants(Variants){}; + static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) { + // clang-format off +#define MMA_VARIANTS(geom, type) {{ \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \ + }} +// Sub-integer MMA only supports row.col layout. +#define MMA_VARIANTS_I4(geom, type) {{ \ + 0, \ + 0, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ + 0, \ + 0, \ + 0, \ + 0 \ + }} +// b1 MMA does not support .satfinite. +#define MMA_VARIANTS_B1(geom, type) {{ \ + 0, \ + 0, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0 \ + }} + // clang-format on + switch (BuiltinID) { + // FP MMA + // Note that 'type' argument of MMA_VARIANT uses D_C notation, while + // NumEltsN of return value are ordered as A,B,C,D. + case NVPTX::BI__hmma_m16n16k16_mma_f16f16: + return {8, 8, 4, 4, MMA_VARIANTS(m16n16k16, f16_f16)}; + case NVPTX::BI__hmma_m16n16k16_mma_f32f16: + return {8, 8, 4, 8, MMA_VARIANTS(m16n16k16, f32_f16)}; + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: + return {8, 8, 8, 4, MMA_VARIANTS(m16n16k16, f16_f32)}; + case NVPTX::BI__hmma_m16n16k16_mma_f32f32: + return {8, 8, 8, 8, MMA_VARIANTS(m16n16k16, f32_f32)}; + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + return {8, 8, 4, 4, MMA_VARIANTS(m32n8k16, f16_f16)}; + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + return {8, 8, 4, 8, MMA_VARIANTS(m32n8k16, f32_f16)}; + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + return {8, 8, 8, 4, MMA_VARIANTS(m32n8k16, f16_f32)}; + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + return {8, 8, 8, 8, MMA_VARIANTS(m32n8k16, f32_f32)}; + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + return {8, 8, 4, 4, MMA_VARIANTS(m8n32k16, f16_f16)}; + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + return {8, 8, 4, 8, MMA_VARIANTS(m8n32k16, f32_f16)}; + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: + return {8, 8, 8, 4, MMA_VARIANTS(m8n32k16, f16_f32)}; + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + return {8, 8, 8, 8, MMA_VARIANTS(m8n32k16, f32_f32)}; + + // Integer MMA + case NVPTX::BI__imma_m16n16k16_mma_s8: + return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, s8)}; + case NVPTX::BI__imma_m16n16k16_mma_u8: + return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, u8)}; + case NVPTX::BI__imma_m32n8k16_mma_s8: + return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, s8)}; + case NVPTX::BI__imma_m32n8k16_mma_u8: + return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, u8)}; + case NVPTX::BI__imma_m8n32k16_mma_s8: + return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, s8)}; + case NVPTX::BI__imma_m8n32k16_mma_u8: + return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, u8)}; + + // Sub-integer MMA + case NVPTX::BI__imma_m8n8k32_mma_s4: + return {1, 1, 2, 2, MMA_VARIANTS_I4(m8n8k32, s4)}; + case NVPTX::BI__imma_m8n8k32_mma_u4: + return {1, 1, 2, 2, 
MMA_VARIANTS_I4(m8n8k32, u4)}; + case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1: + return {1, 1, 2, 2, MMA_VARIANTS_B1(m8n8k128, b1)}; + default: + llvm_unreachable("Unexpected builtin ID."); + } +#undef MMA_VARIANTS +#undef MMA_VARIANTS_I4 +#undef MMA_VARIANTS_B1 + } +}; +} // namespace + +Value * +CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { auto MakeLdg = [&](unsigned IntrinsicID) { Value *Ptr = EmitScalarExpr(E->getArg(0)); clang::CharUnits Align = @@ -13143,6 +13401,8 @@ Builder.CreateStore(Pred, PredOutPtr); return Builder.CreateExtractValue(ResultPair, 0); } + + // FP MMA loads case NVPTX::BI__hmma_m16n16k16_ld_a: case NVPTX::BI__hmma_m16n16k16_ld_b: case NVPTX::BI__hmma_m16n16k16_ld_c_f16: @@ -13154,7 +13414,33 @@ case NVPTX::BI__hmma_m8n32k16_ld_a: case NVPTX::BI__hmma_m8n32k16_ld_b: case NVPTX::BI__hmma_m8n32k16_ld_c_f16: - case NVPTX::BI__hmma_m8n32k16_ld_c_f32: { + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: + // Integer MMA loads. + case NVPTX::BI__imma_m16n16k16_ld_a_s8: + case NVPTX::BI__imma_m16n16k16_ld_a_u8: + case NVPTX::BI__imma_m16n16k16_ld_b_s8: + case NVPTX::BI__imma_m16n16k16_ld_b_u8: + case NVPTX::BI__imma_m16n16k16_ld_c: + case NVPTX::BI__imma_m32n8k16_ld_a_s8: + case NVPTX::BI__imma_m32n8k16_ld_a_u8: + case NVPTX::BI__imma_m32n8k16_ld_b_s8: + case NVPTX::BI__imma_m32n8k16_ld_b_u8: + case NVPTX::BI__imma_m32n8k16_ld_c: + case NVPTX::BI__imma_m8n32k16_ld_a_s8: + case NVPTX::BI__imma_m8n32k16_ld_a_u8: + case NVPTX::BI__imma_m8n32k16_ld_b_s8: + case NVPTX::BI__imma_m8n32k16_ld_b_u8: + case NVPTX::BI__imma_m8n32k16_ld_c: + // Sub-integer MMA loads. + case NVPTX::BI__imma_m8n8k32_ld_a_s4: + case NVPTX::BI__imma_m8n8k32_ld_a_u4: + case NVPTX::BI__imma_m8n8k32_ld_b_s4: + case NVPTX::BI__imma_m8n8k32_ld_b_u4: + case NVPTX::BI__imma_m8n8k32_ld_c: + case NVPTX::BI__bmma_m8n8k128_ld_a_b1: + case NVPTX::BI__bmma_m8n8k128_ld_b_b1: + case NVPTX::BI__bmma_m8n8k128_ld_c: + { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Value *Src = EmitScalarExpr(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -13162,82 +13448,28 @@ if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) return nullptr; bool isColMajor = isColMajorArg.getSExtValue(); - unsigned IID; - unsigned NumResults; - switch (BuiltinID) { - case NVPTX::BI__hmma_m16n16k16_ld_a: - IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m16n16k16_ld_b: - IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m16n16k16_ld_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m16n16k16_ld_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m32n8k16_ld_a: - IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m32n8k16_ld_b: - IID = isColMajor ? 
Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m32n8k16_ld_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m32n8k16_ld_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m8n32k16_ld_a: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m8n32k16_ld_b: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride; - NumResults = 8; - break; - case NVPTX::BI__hmma_m8n32k16_ld_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m8n32k16_ld_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride; - NumResults = 8; - break; - default: - llvm_unreachable("Unexpected builtin ID."); - } + NVPTXMmaLdstInfo II(BuiltinID); + unsigned IID = isColMajor ? II.IID_col : II.IID_row; + if (IID == 0) + return nullptr; + Value *Result = Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm}); // Save returned values. - for (unsigned i = 0; i < NumResults; ++i) { - Builder.CreateAlignedStore( - Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), - Dst.getElementType()), - Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)), - CharUnits::fromQuantity(4)); + assert(II.NumResults); + if (II.NumResults == 1) { + Builder.CreateAlignedStore(Result, Dst.getPointer(), + CharUnits::fromQuantity(4)); + } else { + for (unsigned i = 0; i < II.NumResults; ++i) { + Builder.CreateAlignedStore( + Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), + Dst.getElementType()), + Builder.CreateGEP(Dst.getPointer(), + llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + } } return Result; } @@ -13247,7 +13479,12 @@ case NVPTX::BI__hmma_m32n8k16_st_c_f16: case NVPTX::BI__hmma_m32n8k16_st_c_f32: case NVPTX::BI__hmma_m8n32k16_st_c_f16: - case NVPTX::BI__hmma_m8n32k16_st_c_f32: { + case NVPTX::BI__hmma_m8n32k16_st_c_f32: + case NVPTX::BI__imma_m16n16k16_st_c_i32: + case NVPTX::BI__imma_m32n8k16_st_c_i32: + case NVPTX::BI__imma_m8n32k16_st_c_i32: + case NVPTX::BI__imma_m8n8k32_st_c_i32: + case NVPTX::BI__bmma_m8n8k128_st_c_i32: { Value *Dst = EmitScalarExpr(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -13255,45 +13492,15 @@ if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) return nullptr; bool isColMajor = isColMajorArg.getSExtValue(); - unsigned IID; - unsigned NumResults = 8; - // PTX Instructions (and LLVM intrinsics) are defined for slice _d_, yet - // for some reason nvcc builtins use _c_. - switch (BuiltinID) { - case NVPTX::BI__hmma_m16n16k16_st_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m16n16k16_st_c_f32: - IID = isColMajor ? 
Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride - : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride; - break; - case NVPTX::BI__hmma_m32n8k16_st_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m32n8k16_st_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride - : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride; - break; - case NVPTX::BI__hmma_m8n32k16_st_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride; - NumResults = 4; - break; - case NVPTX::BI__hmma_m8n32k16_st_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride - : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride; - break; - default: - llvm_unreachable("Unexpected builtin ID."); - } - Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType()); + NVPTXMmaLdstInfo II(BuiltinID); + unsigned IID = isColMajor ? II.IID_col : II.IID_row; + if (IID == 0) + return nullptr; + Function *Intrinsic = + CGM.getIntrinsic(IID, Dst->getType()); llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1); SmallVector Values = {Dst}; - for (unsigned i = 0; i < NumResults; ++i) { + for (unsigned i = 0; i < II.NumResults; ++i) { Value *V = Builder.CreateAlignedLoad( Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)), CharUnits::fromQuantity(4)); @@ -13317,7 +13524,16 @@ case NVPTX::BI__hmma_m8n32k16_mma_f16f16: case NVPTX::BI__hmma_m8n32k16_mma_f32f16: case NVPTX::BI__hmma_m8n32k16_mma_f32f32: - case NVPTX::BI__hmma_m8n32k16_mma_f16f32: { + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: + case NVPTX::BI__imma_m16n16k16_mma_s8: + case NVPTX::BI__imma_m16n16k16_mma_u8: + case NVPTX::BI__imma_m32n8k16_mma_s8: + case NVPTX::BI__imma_m32n8k16_mma_u8: + case NVPTX::BI__imma_m8n32k16_mma_s8: + case NVPTX::BI__imma_m8n32k16_mma_u8: + case NVPTX::BI__imma_m8n8k32_mma_s4: + case NVPTX::BI__imma_m8n8k32_mma_u4: + case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Address SrcA = EmitPointerWithAlignment(E->getArg(1)); Address SrcB = EmitPointerWithAlignment(E->getArg(2)); @@ -13332,116 +13548,35 @@ if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext())) return nullptr; bool Satf = SatfArg.getSExtValue(); - - // clang-format off -#define MMA_VARIANTS(geom, type) {{ \ - Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ - Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ - Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ - Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ - Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \ - Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \ - }} - // clang-format on - - auto getMMAIntrinsic = [Layout, Satf](std::array Variants) { - unsigned Index = Layout * 2 + Satf; - assert(Index < 8); - return Variants[Index]; - }; - unsigned IID; - unsigned NumEltsC; - unsigned NumEltsD; - switch (BuiltinID) { - case NVPTX::BI__hmma_m16n16k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16)); - NumEltsC = 4; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m16n16k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16)); - NumEltsC = 4; - NumEltsD = 8; - break; - case 
NVPTX::BI__hmma_m16n16k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32)); - NumEltsC = 8; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32)); - NumEltsC = 8; - NumEltsD = 8; - break; - case NVPTX::BI__hmma_m32n8k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16)); - NumEltsC = 4; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m32n8k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16)); - NumEltsC = 4; - NumEltsD = 8; - break; - case NVPTX::BI__hmma_m32n8k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32)); - NumEltsC = 8; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m32n8k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32)); - NumEltsC = 8; - NumEltsD = 8; - break; - case NVPTX::BI__hmma_m8n32k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16)); - NumEltsC = 4; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m8n32k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16)); - NumEltsC = 4; - NumEltsD = 8; - break; - case NVPTX::BI__hmma_m8n32k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32)); - NumEltsC = 8; - NumEltsD = 4; - break; - case NVPTX::BI__hmma_m8n32k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32)); - NumEltsC = 8; - NumEltsD = 8; - break; - default: - llvm_unreachable("Unexpected builtin ID."); - } -#undef MMA_VARIANTS + NVPTXMmaInfo MI(BuiltinID); + unsigned IID = MI.getMMAIntrinsic(Layout, Satf); + if (IID == 0) // Unsupported combination of Layout/Satf. + return nullptr; SmallVector Values; Function *Intrinsic = CGM.getIntrinsic(IID); - llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0); + llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0); // Load A - for (unsigned i = 0; i < 8; ++i) { + for (unsigned i = 0; i < MI.NumEltsA; ++i) { Value *V = Builder.CreateAlignedLoad( Builder.CreateGEP(SrcA.getPointer(), llvm::ConstantInt::get(IntTy, i)), CharUnits::fromQuantity(4)); - Values.push_back(Builder.CreateBitCast(V, ABType)); + Values.push_back(Builder.CreateBitCast(V, AType)); } // Load B - for (unsigned i = 0; i < 8; ++i) { + llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA); + for (unsigned i = 0; i < MI.NumEltsB; ++i) { Value *V = Builder.CreateAlignedLoad( Builder.CreateGEP(SrcB.getPointer(), llvm::ConstantInt::get(IntTy, i)), CharUnits::fromQuantity(4)); - Values.push_back(Builder.CreateBitCast(V, ABType)); + Values.push_back(Builder.CreateBitCast(V, BType)); } // Load C - llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16); - for (unsigned i = 0; i < NumEltsC; ++i) { + llvm::Type *CType = + Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB); + for (unsigned i = 0; i < MI.NumEltsC; ++i) { Value *V = Builder.CreateAlignedLoad( Builder.CreateGEP(SrcC.getPointer(), llvm::ConstantInt::get(IntTy, i)), @@ -13450,7 +13585,7 @@ } Value *Result = Builder.CreateCall(Intrinsic, Values); llvm::Type *DType = Dst.getElementType(); - for (unsigned i = 0; i < NumEltsD; ++i) + for (unsigned i = 0; i < MI.NumEltsD; ++i) Builder.CreateAlignedStore( Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType), Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)), Index: clang/lib/Driver/ToolChains/Cuda.cpp =================================================================== --- clang/lib/Driver/ToolChains/Cuda.cpp +++ 
clang/lib/Driver/ToolChains/Cuda.cpp
@@ -644,19 +644,25 @@
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

-  // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM
-  // defaults to. Use PTX4.2 by default, which is the PTX version that came with
-  // CUDA-7.0.
-  const char *PtxFeature = "+ptx42";
-  // TODO(tra): CUDA-10+ needs PTX 6.3 to support new features. However that
-  // requires fair amount of work on LLVM side. We'll keep using PTX 6.1 until
-  // all prerequisites are in place.
-  if (CudaInstallation.version() >= CudaVersion::CUDA_91) {
-    // CUDA-9.1 uses new instructions that are only available in PTX6.1+
-    PtxFeature = "+ptx61";
-  } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
-    // CUDA-9.0 uses new instructions that are only available in PTX6.0+
-    PtxFeature = "+ptx60";
+  // New CUDA versions often introduce new instructions that are only supported
+  // by a new PTX version, so we need to raise the PTX level to enable them in
+  // the NVPTX back-end.
+  const char *PtxFeature = nullptr;
+  switch (CudaInstallation.version()) {
+  case CudaVersion::CUDA_101:
+    PtxFeature = "+ptx64";
+    break;
+  case CudaVersion::CUDA_100:
+    PtxFeature = "+ptx63";
+    break;
+  case CudaVersion::CUDA_91:
+    PtxFeature = "+ptx61";
+    break;
+  case CudaVersion::CUDA_90:
+    PtxFeature = "+ptx60";
+    break;
+  default:
+    PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
Index: clang/test/CodeGen/builtins-nvptx-mma.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-mma.cu
@@ -0,0 +1,755 @@
+
+//
+// *** DO NOT EDIT ***
+//
+// This test has been automatically generated by
+// builtins-nvptx-mma.py --ptx=63 --gpu-arch=75
+//
+// Make sure we can handle all builtins available on sm_75 with PTX63
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_75 \
+// RUN:   -fcuda-is-device -target-feature +ptx63 \
+// RUN:   -DPTX=63 -DSM=75 \
+// RUN:   -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefixes=CHECK_PTX61_SM70,CHECK_PTX63_SM75,CHECK_PTX63_SM72,CHECK_PTX60_SM70 %s
+// Verify that all builtins have correct constraints.
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown \ +// RUN: -target-cpu sm_60 -target-feature +ptx42 \ +// RUN: -DPTX=63 -DSM=75 -fcuda-is-device -S -o /dev/null -x cuda \ +// RUN: -verify %s + + +#if !defined(CUDA_VERSION) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +typedef unsigned long long uint64_t; +#endif + +// CHECK-LABEL: test_wmma_buitins +__device__ void test_wmma_buitins(int *src, int *dst, + float *fsrc, float *fdst, int ldm) { + + +#if (PTX >= 60) && (SM >= 70) + + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_a(dst, src, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_a(dst, src, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_b(dst, src, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_b(dst, src, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target 
feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, 
src, src, 0, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_PTX60_SM70: call {{.*}} 
@llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); +#endif // (PTX >= 60) && (SM >= 70) + +#if (PTX >= 61) && (SM >= 70) + + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_a(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_a(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_b(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_b(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX61_SM70: 
call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_a(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_a(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_b(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_b(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite + // 
expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + 
__hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} 
@llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target 
feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // 
CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32 + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite + // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); +#endif // (PTX >= 61) && (SM >= 70) + +#if (PTX >= 63) && (SM >= 72) + + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_a_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_a_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_a_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_a_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_b_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_b_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_b_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_b_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_c(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32 + // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_ld_c(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32 + // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_st_c_i32(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32 + // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + 
__imma_m16n16k16_st_c_i32(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_a_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_a_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_a_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_a_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_b_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_b_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_b_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_b_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_c(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32 + // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_ld_c(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32 + // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_st_c_i32(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32 + // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_st_c_i32(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_a_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_a_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_a_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8 + // expected-error-re@+1 
{{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_a_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_b_s8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_b_s8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_b_u8(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_b_u8(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_c(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32 + // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_ld_c(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32 + // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_st_c_i32(dst, src, ldm, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32 + // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_st_c_i32(dst, src, ldm, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, 
src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_s8(dst, src, src, src, 0, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8 + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 1); + // 
CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8 + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8.satfinite 
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8 + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 0); + // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8.satfinite + // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}} + __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 1); +#endif // (PTX >= 63) && (SM >= 72) + +#if (PTX >= 63) && (SM >= 75) + + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.a.row.stride.b1 + // expected-error-re@+1 {{'__bmma_m8n8k128_ld_a_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_ld_a_b1(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.b.col.stride.b1 + // expected-error-re@+1 {{'__bmma_m8n8k128_ld_b_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_ld_b_b1(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.col.stride.s32 + // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_ld_c(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.row.stride.s32 + // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_ld_c(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.col.stride.s32 + // expected-error-re@+1 
{{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_st_c_i32(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.row.stride.s32 + // expected-error-re@+1 {{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_st_c_i32(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.s4 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_a_s4(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.u4 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_a_u4(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.s4 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_b_s4(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.u4 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_b_u4(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.col.stride.s32 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_c(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.row.stride.s32 + // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_ld_c(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.col.stride.s32 + // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_st_c_i32(dst, src, ldm, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.row.stride.s32 + // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_st_c_i32(dst, src, ldm, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.mma.row.col.b1 + // expected-error-re@+1 {{'__bmma_m8n8k128_mma_xor_popc_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __bmma_m8n8k128_mma_xor_popc_b1(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4 + // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4.satfinite + // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 1); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4 + // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 0); + // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4.satfinite + // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}} + __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 1); +#endif // (PTX >= 63) && (SM >= 75) +} Index: clang/test/CodeGen/builtins-nvptx-mma.py =================================================================== --- /dev/null +++ clang/test/CodeGen/builtins-nvptx-mma.py @@ -0,0 +1,339 @@ +# This script 
generates all variants of wmma builtins, verifies that clang calls +# correct LLVM intrinsics, and checks that availability of specific builtins is +# constrained by the correct PTX version and the target GPU variant. + +# Dummy test run to avoid lit warnings. +# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null + +from __future__ import print_function + +import argparse +from collections import defaultdict +from itertools import product +from string import Template + +class MMAFrag: + def __init__(self, geom, frag, ptx_elt_type): + self.geom = geom + self.frag = frag + self.ptx_type = ptx_elt_type + + def __repr__(self): + return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type) + +class MMAOp: + def __init__(self, a, b, c, d): + self.a = a + self.b = b + self.c = c + self.d = d + + def __repr__(self): + return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d)) + +def make_mma_ops(geoms, types_a, types_b, types_c, types_d): + ops = [] + for geom, type_a, type_c in product(geoms, types_a, types_c): + for type_b, type_d in product(types_b if types_b else [type_a], + types_d if types_d else [type_c]): + ops.append(MMAOp(MMAFrag(geom, "a", type_a), + MMAFrag(geom, "b", type_b), + MMAFrag(geom, "c", type_c), + MMAFrag(geom, "d", type_d))) + return ops + +def make_ldst_ops(geoms, frags, types): + return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type) + in product(geoms, frags, types)] + +def get_mma_ops(): + return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"], + ["f16"], [], ["f16", "f32"], ["f16", "f32"]) + + make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"], + ["s8", "u8"], [], ["s32"], []) + + make_mma_ops(["m8n8k32"], + ["s4", "u4"], [], ["s32"], []) + + make_mma_ops(["m8n8k128"], + ["b1"], [], ["s32"], [])) +def get_ldst_ops(): + return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"], + ["a", "b"], ["f16", "u8", "s8"]) + + make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"], + ["c", "d"], ["f16", "f32", "s32"]) + + make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4","u4"]) + + make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) + + make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"])) + +def is_geom_supported(geom): + # geometries for FP and ints. + if geom in ["m8n32k16", "m32n8k16"]: + return ptx_version >= 61 + # geometries for sub-ints. + if geom in ["m8n8k32", "m8n8k128"]: + return ptx_version >= 63 and gpu_arch >= 75 + if geom == "m16n16k16": + return ptx_version >= 60 + assert(False) # Unexpected geometry. + +def is_type_supported(ptx_type): + if ptx_type in ["s8", "u8", "s32"]: + return ptx_version >= 63 and gpu_arch >= 72 + if ptx_type in ["s4", "u4", "b1"]: + return ptx_version >= 63 and gpu_arch >= 75 + return ptx_version >= 60 and gpu_arch >= 70 + +def is_mma_variant_supported(op, layout_a, layout_b, satf): + if not (is_type_supported(op.a.ptx_type) + and is_geom_supported(op.a.geom)): + return False + # Sub-integer ops require row/col layout; b1 additionally has no satf. + if op.a.ptx_type in ["s4", "u4", "b1"]: + if op.a.ptx_type == "b1" and satf: + return False + return layout_a == "row" and layout_b == "col" + return True + +def is_ldst_variant_supported(frag, layout): + if not (is_type_supported(frag.ptx_type) + and is_geom_supported(frag.geom)): + return False + if frag.ptx_type in ["s4", "u4", "b1"]: + # Sub-integer ops require sm_75 and ptx63, and row/col layout for a/b.
+ return ((frag.frag == "a" and layout == "row") + or (frag.frag == "b" and layout == "col") + or frag.frag in ["c", "d"]) + return True + +def get_builtin_prefix(frag): + prefix = None + if frag.geom in ["m16n16k16", "m32n8k16", "m8n32k16"]: + if frag.ptx_type in ["f16", "f32"]: + prefix = "__hmma" + else: + prefix = "__imma" + elif frag.geom == "m8n8k32": + prefix = "__imma" # sub-integers + elif frag.geom == "m8n8k128": + prefix = "__bmma" + assert prefix + return prefix + +def get_ldst_builtin_name(frag): + prefix = get_builtin_prefix(frag) + + if prefix == "__hmma": + suffix = "" if frag.frag in ["a","b"] else frag.ptx_type + elif prefix in ["__imma", "__bmma"]: + suffix = "" if frag.frag in ["c"] else frag.ptx_type + if suffix == "s32": + suffix = "i32" + if frag.frag == "d": + ifrag = "c" + op = "st" + else: + ifrag = frag.frag + op = "ld" + + name = "%s_%s_%s_%s%s" % (prefix, frag.geom, op, ifrag, + "_" + suffix if suffix else "") + return name + +def get_mma_builtin_name(op): + prefix = get_builtin_prefix(op.a) + + if prefix == "__hmma": + suffix = op.d.ptx_type + op.c.ptx_type + else: + suffix = op.a.ptx_type + + name = "%s_%s_mma%s_%s" % (prefix, op.a.geom, + "_xor_popc" if op.a.ptx_type == "b1" else "", + suffix) + return name + + +def get_required_sm(frag): + if frag.ptx_type in ["u4", "s4", "b1"]: + return 75 + if frag.ptx_type in ["s8", "u8"]: + return 72 + if frag.ptx_type == "s32": + if frag.geom in ["m8n8k32", "m8n8k128"]: # s4/u4/b1 + return 75 + else: # s8/u8 + return 72 + if frag.ptx_type in ["f16", "f32"]: + return 70 + assert(False) + +def get_required_ptx(frag): + if frag.ptx_type in ["f16", "f32"]: + return 60 if frag.geom == "m16n16k16" else 61 + return 63 + +def gen_wmma_ldst_tests(results): + load_template = """ + // CHECK${check_suffix}: call {{.*}} @${intrinsic} + // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}} + ${builtin}(${dst}, ${src}, ldm, ${blayout}); +""".rstrip() + intrinsic_template = "llvm.nvvm.wmma.${geom}.${op}.${frag}.${ilayout}.stride.${itype}" + + for frag, layout in sorted(product(get_ldst_ops(), ["row","col"]), key=str): + + if not is_ldst_variant_supported(frag, layout): + continue + + is_fp = frag.ptx_type == "f32" + min_sm = get_required_sm(frag) + min_ptx = get_required_ptx(frag) + params = { + "check_suffix" : "_PTX%d_SM%d" % (min_ptx, min_sm), + "builtin" : get_ldst_builtin_name(frag), + "min_ptx" : min_ptx, + "min_sm" : min_sm, + "dst": "fdst" if is_fp else "dst", + "src": "fsrc" if is_fp else "src", + "blayout" : 0 if layout == "row" else 1, + "intrinsic" : Template(intrinsic_template).substitute({ + "frag" : frag.frag, + "geom" : frag.geom, + "ilayout" : layout, + "itype" : frag.ptx_type, + "op" : "store" if frag.frag == "d" else "load", + }) + } + results[(min_ptx,min_sm)] += Template(load_template).substitute(params) + + return results + +def mma_signature(op): + if op.a.ptx_type in ["s8", "u8", "s4", "u4", "b1"]: + # int and sub-int ops are identified by input type. + return op.a.ptx_type + else: + # the rest are FP ops identified by accumulator & result type. + return "%s.%s" % (op.d.ptx_type, op.c.ptx_type) + +# Get numeric value for rowcol parameter of the builtin +# AFAICT it uses the encoding accepted by NVVM intrinsics: +# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-matrix-mma +def get_ilayout(a, b): + return { + "row.row" : 0, + "row.col" : 1, + "col.row" : 2, + "col.col" : 3 + }[a + "." 
+ b] + +def gen_wmma_mma_tests(results): + mma_template = """ + // CHECK${check_suffix}: call {{.*}} @${intrinsic} + // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}} + ${builtin}(${dst}, ${asrc}, ${asrc}, ${csrc}, ${ilayout}, ${isatf}); +""".rstrip() + intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}" + + for op, alayout, blayout, satf in sorted(product( get_mma_ops(), + ["row","col"], + ["row","col"], + [".satfinite", ""]), + key=str): + + if not is_mma_variant_supported(op, alayout, blayout, satf): + continue + + a_is_fp = op.a.ptx_type == "f32" + c_is_fp = op.c.ptx_type == "f32" + d_is_fp = op.d.ptx_type == "f32" + min_sm = get_required_sm(op.a) + min_ptx = get_required_ptx(op.a) + params = { + "check_suffix" : "_PTX%d_SM%d" % (min_ptx, min_sm), + "builtin" : get_mma_builtin_name(op), + "min_ptx" : min_ptx, + "min_sm" : min_sm, + "dst": "fdst" if d_is_fp else "dst", + "asrc": "fsrc" if a_is_fp else "src", + "csrc": "fsrc" if c_is_fp else "src", + "ilayout" : get_ilayout(alayout, blayout), + "isatf" : 1 if satf else 0, + "intrinsic" : Template(intrinsic_template).substitute({ + "geom" : op.a.geom, + "alayout" : alayout, + "blayout" : blayout, + "intrinsic_signature" : mma_signature(op), + "satf" : satf, + }) + } + results[(min_ptx, min_sm)] += Template(mma_template).substitute(params) + + return results + +def gen_tests(): + results = gen_wmma_ldst_tests(defaultdict(str)) + results = gen_wmma_mma_tests(results) + + run_template = r""" +// +// *** DO NOT EDIT *** +// +// This test has been automatically generated by +// builtins-nvtx-mma.py --ptx=${ptx} --gpu-arch=${sm} +// +// Make sure we can handle all builtins available on sm_${sm} with PTX${ptx} +// ${run}: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_${sm} \ +// ${run}: -fcuda-is-device -target-feature +ptx${ptx} \ +// ${run}: -DPTX=${ptx} -DSM=${sm} \ +// ${run}: -S -emit-llvm -o - -x cuda %s \ +// ${run}: | FileCheck -check-prefixes=${check_labels} %s +// Verify that all builtins have correct constraints. 
+// ${run}: %clang_cc1 -triple nvptx-unknown-unknown \ +// ${run}: -target-cpu sm_60 -target-feature +ptx42 \ +// ${run}: -DPTX=${ptx} -DSM=${sm} -fcuda-is-device -S -o /dev/null -x cuda \ +// ${run}: -verify %s +""" + def supported_variants(ptx, sm, results): + return [(ptx_, sm_) for ptx_, sm_ in results if ptx_ <= ptx and sm_ <= sm] + + print(Template(run_template).substitute({ + "run" : "RUN", # To avoid lit misinterpreting the template + "ptx" : ptx_version, + "sm" : gpu_arch, + "check_labels" : ",".join(["CHECK_PTX%d_SM%d" % (ptx_, sm_) + for ptx_, sm_ + in supported_variants(ptx_version, gpu_arch, + results)]) + })) + + print(""" +#if !defined(CUDA_VERSION) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +typedef unsigned long long uint64_t; +#endif + +// CHECK-LABEL: test_wmma_buitins +__device__ void test_wmma_buitins(int *src, int *dst, + float *fsrc, float *fdst, int ldm) { +"""); + + for (ptx, sm), tests in sorted(results.items()): + print() + print("#if (PTX >= %d) && (SM >= %d)" % (ptx, sm)) + print(tests) + print("#endif // (PTX >= %d) && (SM >= %d) "% (ptx, sm)) + + print("}") + +parser = argparse.ArgumentParser() +parser.add_argument("--ptx", type=int, default=60) +parser.add_argument("--gpu-arch", type=int, default=70) +args = parser.parse_args() +ptx_version = args.ptx +gpu_arch = args.gpu_arch + +gen_tests() Index: llvm/lib/Target/NVPTX/NVPTX.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTX.td +++ llvm/lib/Target/NVPTX/NVPTX.td @@ -75,6 +75,8 @@ "Use PTX version 6.1">; def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63", "Use PTX version 6.3">; +def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64", + "Use PTX version 6.4">; //===----------------------------------------------------------------------===// // NVPTX supported processors.