Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -692,17 +692,41 @@ BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") // Builtins to support WMMA instructions on sm_70 -TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "ptx60") - -TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "ptx60") -TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx60") + +TARGET_BUILTIN(__hmma_m32n8k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx61") + +TARGET_BUILTIN(__hmma_m8n32k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx61") + +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx60") + +TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx61") + +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx61") +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx61") #undef BUILTIN #undef TARGET_BUILTIN Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -10525,7 +10525,15 @@ case NVPTX::BI__hmma_m16n16k16_ld_a: case NVPTX::BI__hmma_m16n16k16_ld_b: case NVPTX::BI__hmma_m16n16k16_ld_c_f16: - case NVPTX::BI__hmma_m16n16k16_ld_c_f32: { + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + case NVPTX::BI__hmma_m32n8k16_ld_a: + case NVPTX::BI__hmma_m32n8k16_ld_b: + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + case NVPTX::BI__hmma_m8n32k16_ld_a: + case NVPTX::BI__hmma_m8n32k16_ld_b: + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Value *Src = EmitScalarExpr(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10556,6 +10564,46 @@ : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride; NumResults = 8; break; + case NVPTX::BI__hmma_m32n8k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride; + NumResults = 8; + break; default: llvm_unreachable("Unexpected builtin ID."); } @@ -10574,7 +10622,11 @@ } case NVPTX::BI__hmma_m16n16k16_st_c_f16: - case NVPTX::BI__hmma_m16n16k16_st_c_f32: { + case NVPTX::BI__hmma_m16n16k16_st_c_f32: + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + case NVPTX::BI__hmma_m8n32k16_st_c_f32: { Value *Dst = EmitScalarExpr(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10596,6 +10648,24 @@ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride; break; + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride; + break; default: llvm_unreachable("Unexpected builtin ID."); } @@ -10618,7 +10688,15 @@ case NVPTX::BI__hmma_m16n16k16_mma_f16f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - case NVPTX::BI__hmma_m16n16k16_mma_f16f32: { + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Address SrcA = EmitPointerWithAlignment(E->getArg(1)); Address SrcB = EmitPointerWithAlignment(E->getArg(2)); @@ -10635,15 +10713,15 @@ bool Satf = SatfArg.getSExtValue(); // clang-format off -#define MMA_VARIANTS(type) {{ \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type, \ - Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type##_satfinite \ +#define MMA_VARIANTS(geom, type) {{ \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \ }} // clang-format on @@ -10657,22 +10735,62 @@ unsigned NumEltsD; switch (BuiltinID) { case NVPTX::BI__hmma_m16n16k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16)); NumEltsC = 4; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16)); NumEltsC = 4; NumEltsD = 8; break; case NVPTX::BI__hmma_m16n16k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32)); NumEltsC = 8; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32)); NumEltsC = 8; NumEltsD = 8; break; Index: clang/lib/Driver/ToolChains/Cuda.cpp =================================================================== --- clang/lib/Driver/ToolChains/Cuda.cpp +++ clang/lib/Driver/ToolChains/Cuda.cpp @@ -570,17 +570,19 @@ CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); - if (CudaInstallation.version() >= CudaVersion::CUDA_90) { + // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM + // defaults to. Use PTX4.2 by default, which is the PTX version that came with + // CUDA-7.0. + const char *PtxFeature = "+ptx42"; + if (CudaInstallation.version() >= CudaVersion::CUDA_91) { + // CUDA-9 uses new instructions that are only available in PTX6.1 + PtxFeature = "+ptx61"; + } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) { // CUDA-9 uses new instructions that are only available in PTX6.0 - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+ptx60"); - } else { - // Libdevice in CUDA-7.0 requires PTX version that's more recent - // than LLVM defaults to. Use PTX4.2 which is the PTX version that - // came with CUDA-7.0. - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+ptx42"); + PtxFeature = "+ptx60"; } + CC1Args.push_back("-target-feature"); + CC1Args.push_back(PtxFeature); if (DeviceOffloadingKind == Action::OFK_OpenMP) { SmallVector LibraryPaths; Index: clang/test/CodeGen/builtins-nvptx-sm_70.cu =================================================================== --- clang/test/CodeGen/builtins-nvptx-sm_70.cu +++ clang/test/CodeGen/builtins-nvptx-sm_70.cu @@ -1,9 +1,16 @@ // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \ // RUN: -fcuda-is-device -target-feature +ptx60 \ // RUN: -S -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CHECK %s +// RUN: | FileCheck -check-prefix=CHECK_M16 %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \ +// RUN: -fcuda-is-device -target-feature +ptx61 -DPTX61 \ +// RUN: -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ -// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s +// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-sm_70 %s +// RUN: %clang_cc1 -triple nvptx-unknown-unknown \ +// RUN: -target-cpu sm_70 -target-feature +ptx60 \ +// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-ptx61 %s #if !defined(CUDA_VERSION) #define __device__ __attribute__((device)) @@ -18,149 +25,443 @@ // that encounters an error, so -verify will not be able to find errors in // subsequent functions. -// CHECK-LABEL: nvvm_wmma -__device__ void nvvm_wmma(int *src, int *dst, - float *fsrc, float *fdst, - int ldm) { - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} +// CHECK-LABEL: nvvm_wmma_m16n16k16 +__device__ void nvvm_wmma_m16n16k16(int *src, int *dst, + float *fsrc, float *fdst, + int ldm) { + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_a(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_a(dst, src+1, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_b(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_b(dst, src+2, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32 - // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32 - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32 + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); - // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite - // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite + // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}} __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); } + +#ifdef PTX61 +// CHECK-LABEL: nvvm_wmma_m32n8k16 +__device__ void nvvm_wmma_m32n8k16(int *src, int *dst, + float *fsrc, float *fdst, + int ldm) { + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_a(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_a(dst, src+1, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_b(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_b(dst, src+2, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); + + + // m8n32k16 variants. + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_a(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_a(dst, src+1, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_b(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_b(dst, src+2, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1); + + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32 + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite + // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}} + __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); +} +#endif Index: llvm/include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsNVVM.td +++ llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -3920,7 +3920,9 @@ } multiclass NVVM_WMMA_LD { + defm _m32n8k16_load: NVVM_WMMA_LD_G<"m32n8k16">; defm _m16n16k16_load: NVVM_WMMA_LD_G<"m16n16k16">; + defm _m8n32k16_load: NVVM_WMMA_LD_G<"m8n32k16">; } defm int_nvvm_wmma: NVVM_WMMA_LD; @@ -3947,7 +3949,7 @@ # !if(WithStride, ".stride", "") # "." # Type>; -multiclass NVVM_WMMA_STD_GLT { def _stride: NVVM_WMMA_STD_GLSTS; def NAME: NVVM_WMMA_STD_GLSTS; @@ -3963,7 +3965,9 @@ } multiclass NVVM_WMMA_STD { + defm _m32n8k16_store: NVVM_WMMA_STD_G<"m32n8k16">; defm _m16n16k16_store: NVVM_WMMA_STD_G<"m16n16k16">; + defm _m8n32k16_store: NVVM_WMMA_STD_G<"m8n32k16">; } defm int_nvvm_wmma: NVVM_WMMA_STD; @@ -4033,7 +4037,9 @@ } multiclass NVVM_WMMA_MMA { + defm _m32n8k16_mma : NVVM_WMMA_MMA_G<"m32n8k16">; defm _m16n16k16_mma : NVVM_WMMA_MMA_G<"m16n16k16">; + defm _m8n32k16_mma : NVVM_WMMA_MMA_G<"m8n32k16">; } defm int_nvvm_wmma : NVVM_WMMA_MMA; Index: llvm/lib/Target/NVPTX/NVPTX.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTX.td +++ llvm/lib/Target/NVPTX/NVPTX.td @@ -52,6 +52,8 @@ "Target SM 6.2">; def SM70 : SubtargetFeature<"sm_70", "SmVersion", "70", "Target SM 7.0">; +def SM72 : SubtargetFeature<"sm_72", "SmVersion", "72", + "Target SM 7.2">; // PTX Versions def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", @@ -68,6 +70,8 @@ "Use PTX version 5.0">; def PTX60 : SubtargetFeature<"ptx60", "PTXVersion", "60", "Use PTX version 6.0">; +def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61", + "Use PTX version 6.1">; //===----------------------------------------------------------------------===// // NVPTX supported processors. @@ -89,6 +93,7 @@ def : Proc<"sm_61", [SM61, PTX50]>; def : Proc<"sm_62", [SM62, PTX50]>; def : Proc<"sm_70", [SM70, PTX60]>; +def : Proc<"sm_72", [SM72, PTX61]>; def NVPTXInstrInfo : InstrInfo { } Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3330,7 +3330,23 @@ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: - case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: { + case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::v8f16; Info.ptrVal = I.getArgOperand(0); @@ -3343,7 +3359,15 @@ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: - case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: { + case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::v4f16; Info.ptrVal = I.getArgOperand(0); @@ -3356,7 +3380,15 @@ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: - case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: { + case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::v8f32; Info.ptrVal = I.getArgOperand(0); @@ -3369,7 +3401,15 @@ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: - case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: { + case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MVT::v4f16; Info.ptrVal = I.getArgOperand(0); @@ -3382,7 +3422,15 @@ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: - case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: { + case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: + case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: + case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MVT::v8f32; Info.ptrVal = I.getArgOperand(0); Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -142,6 +142,7 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; +def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7378,7 +7378,11 @@ class WMMA_LOAD_GALSTOS - : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> { + : EmptyNVPTXInst, + Requires<[!if(!eq(Geometry, "m16n16k16"), + hasPTX60, + hasPTX61), + hasSM70]> { // Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic // for this function. PatFrag IntrMatcher = !cast("INT_WMMA_" @@ -7420,10 +7424,10 @@ let InOperandList = Ins; let AsmString = "wmma.load." # Abc - # ".sync." - # Layout - # ".m16n16k16" - # Space + # ".sync" + # "." # Layout + # "." # Geometry + # Space # "." # Type # " \t" # !if(!eq(Abc#Type, "cf16"), "{{$r0, $r1, $r2, $r3}}", @@ -7512,7 +7516,9 @@ defm _load_c_f32: WMMA_LOAD_GAT; } +defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">; defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">; +defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">; // // wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32] @@ -7520,7 +7526,11 @@ class WMMA_STORE_D_GLSTSO - : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> { + : EmptyNVPTXInst, + Requires<[!if(!eq(Geometry, "m16n16k16"), + hasPTX60, + hasPTX61), + hasSM70]> { PatFrag IntrMatcher = !cast("INT_WMMA" # "_" # Geometry # "_store_d" # "_" # Type @@ -7641,11 +7651,9 @@ defm _store_d_f32: WMMA_STORE_D_GT; } -// multiclass WMMA_STORE_D { -// defm _m16n16k16: WMMA_STORE_D_G<"m16n16k16">; -// } - +defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">; defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">; +defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">; // WMMA.MMA class WMMA_MMA_GABDCS - : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> { + : EmptyNVPTXInst, + Requires<[!if(!eq(Geometry, "m16n16k16"), + hasPTX60, + hasPTX61), + hasSM70]> { Intrinsic Intr = !cast("int_nvvm_wmma_" # Geometry # "_mma" @@ -7686,7 +7698,7 @@ let AsmString = "wmma.mma.sync." # ALayout # "." # BLayout - # ".m16n16k16" + # "." # Geometry # "." # DType # "." # CType # Satfinite # "\n\t\t" @@ -7734,4 +7746,6 @@ defm _row: WMMA_MMA_GA; } +defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">; defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">; +defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">; Index: llvm/test/CodeGen/NVPTX/wmma.py =================================================================== --- llvm/test/CodeGen/NVPTX/wmma.py +++ llvm/test/CodeGen/NVPTX/wmma.py @@ -2,7 +2,7 @@ # generates correct instructions for them. # RUN: python %s > %t.ll -# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %t.ll +# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 | FileCheck %t.ll from itertools import product from string import Template @@ -36,13 +36,15 @@ check_f16_4 = "{{%s}}" % ", *".join(["%hh[0-9]+"] * 4) check_f32_8 = "{{%s}}" % ", *".join(["%f[0-9]+"] * 8) +known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"] + def gen_wmma_load_tests(): load_template = """ declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args}); ; CHECK-LABEL: .func {{.*}}test_${function}( define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) { -; CHECK ${instruction} +; CHECK: ${instruction} ; CHECK: {${check_result}} ; CHECK: [%rd{{[0-9]+}}]${stride_pattern} %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args}); @@ -51,7 +53,7 @@ ; CHECK-LABEL: .func{{.*}}test_${function}_o( define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) { -; CHECK ${instruction} +; CHECK: ${instruction} ; CHECK: {${check_result}} ; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern} %src1 = getelementptr i8, i8 ${as}* %src, i32 128; @@ -60,9 +62,10 @@ } """ intrinsic_template = "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}" - instruction_template = "wmma.load.${abc}.sync.${geom}.${layout}${space}.${itype}" + instruction_template = "wmma.load.${abc}.sync.${layout}.${geom}${space}.${itype}" - for abc, layout, space, stride, itype in product( + for geom, abc, layout, space, stride, itype in product( + known_geoms, "abc", ["row","col"], ["",".shared",".global"], @@ -77,7 +80,7 @@ "itype" : itype, "pspace" : get_pspace(space), "as" : "addrspace(%d)" % get_aspace(space), - "geom" : "m16n16k16", + "geom" : geom, } if itype == "f32" and abc != "c": @@ -112,7 +115,7 @@ ; CHECK-LABEL: .func {{.*}}test_${function}( define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) { -; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}} +; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}} ; CHECK: {${check_args}} ; CHECK: ${stride_pattern} call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args}); @@ -121,7 +124,7 @@ ; CHECK-LABEL: .func{{.*}}test_${function}_o( define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) { -; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}}+128] +; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128] ; CHECK: ${check_args} ; CHECK: ${stride_pattern} %src1 = getelementptr i8, i8 ${as}* %src, i32 128; @@ -130,9 +133,10 @@ } """ intrinsic_template = "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}" - instruction_template = "wmma.store.${abc}.sync.${geom}.${layout}${space}.${itype}" + instruction_template = "wmma.store.${abc}.sync.${layout}.${geom}${space}.${itype}" - for abc, layout, space, stride, itype in product( + for geom, abc, layout, space, stride, itype in product( + known_geoms, "d", ["row","col"], ["",".shared",".global"], @@ -147,7 +151,7 @@ "itype" : itype, "pspace" : get_pspace(space), "as" : "addrspace(%d)" % get_aspace(space), - "geom" : "m16n16k16", + "geom" : geom, } test_params = params @@ -174,11 +178,11 @@ ; CHECK-LABEL: .func {{.*}}test_${function}( define ${ret_ty} @test_${function}( ${args}) { -; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}} -; CHECK ${check_d} -; CHECK ${check_ab} -; CHECK ${check_ab} -; CHECK ${check_c} +; CHECK: ${instruction} +; CHECK-NEXT: ${check_d} +; CHECK-NEXT: ${check_ab} +; CHECK-NEXT: ${check_ab} +; CHECK-NEXT: ${check_c} %r = call ${ret_ty} @${intrinsic}( ${args}); ret ${ret_ty} %r; @@ -187,7 +191,8 @@ intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${dtype}.${ctype}${satf}" instruction_template = "wmma.mma.sync.${alayout}.${blayout}.${geom}.${dtype}.${ctype}${satf}" - for alayout, blayout, ctype, dtype, satf in product( + for geom, alayout, blayout, ctype, dtype, satf in product( + known_geoms, ["row","col"], ["row","col"], ["f16", "f32"], @@ -200,7 +205,7 @@ "ctype" : ctype, "dtype" : dtype, "satf" : satf, - "geom" : "m16n16k16", + "geom" : geom, } test_params = params