Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -688,5 +688,18 @@ BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") +// Builtins to support WMMA instructions on sm_70 +TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "ptx60") + +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "ptx60") + #undef BUILTIN #undef TARGET_BUILTIN Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -9720,6 +9720,203 @@ Builder.CreateStore(Pred, PredOutPtr); return Builder.CreateExtractValue(ResultPair, 0); } + case NVPTX::BI__hmma_m16n16k16_ld_a: + case NVPTX::BI__hmma_m16n16k16_ld_b: + case NVPTX::BI__hmma_m16n16k16_ld_c_f16: + case NVPTX::BI__hmma_m16n16k16_ld_c_f32:{ + Address Dst = EmitPointerWithAlignment(E->getArg(0)); + Value *Src = EmitScalarExpr(E->getArg(1)); + Value *Ldm = EmitScalarExpr(E->getArg(2)); + llvm::APSInt isColMajorArg; + if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) + return nullptr; + bool isColMajor = isColMajorArg.getZExtValue(); + unsigned IID; + unsigned NumResults = 8; + switch(BuiltinID) { + case NVPTX::BI__hmma_m16n16k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_load_a_f16_row_stride; + break; + case NVPTX::BI__hmma_m16n16k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_load_b_f16_row_stride; + break; + case NVPTX::BI__hmma_m16n16k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_load_c_f32_row_stride; + break; + default: + llvm_unreachable("Unexpected builtin ID."); + } + Value *Result = + Builder.CreateCall(CGM.getIntrinsic(IID), + {Builder.CreatePointerCast(Src, VoidPtrTy), Ldm}); + + //auto EltTy = cast(Src->getType())->getElementType(); + // Returned are 8 16x2 elements. + for (unsigned i = 0; i < NumResults; ++i) { + Value *V = Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), + Dst.getElementType()); + Builder.CreateAlignedStore( + V, + Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + } + return Result; + } + + case NVPTX::BI__hmma_m16n16k16_st_c_f16: + case NVPTX::BI__hmma_m16n16k16_st_c_f32: { + Value *Dst = EmitScalarExpr(E->getArg(0)); + Address Src = EmitPointerWithAlignment(E->getArg(1)); + Value *Ldm = EmitScalarExpr(E->getArg(2)); + llvm::APSInt isColMajorArg; + if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) + return nullptr; + bool isColMajor = isColMajorArg.getZExtValue(); + unsigned IID; + unsigned NumResults = 8; + // PTX Instructions (and LLVM instrinsics) are defined for slice _d_, yet + // for some reason nvcc buildtins are using _c_. + switch(BuiltinID) { + case NVPTX::BI__hmma_m16n16k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m16n16k16_st_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_store_d_f32_row_stride; + break; + default: + llvm_unreachable("Unexpected builtin ID."); + } + Function * Intrinsic = CGM.getIntrinsic(IID); + llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1); + SmallVector Values; + Values.push_back(Builder.CreatePointerCast(Dst, VoidPtrTy)); + for (unsigned i = 0; i < NumResults; ++i) { + Value *V = Builder.CreateAlignedLoad( + Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + Values.push_back(Builder.CreateBitCast(V, ParamType)); + } + Values.push_back(Ldm); + Value *Result = Builder.CreateCall(Intrinsic, Values); + return Result; + } + + // BI__hmma_m16n16k16_mma_(d, a, b, c, layout, satf) + // --> Intrinsic::nvvm_wmma_mma_sync + case NVPTX::BI__hmma_m16n16k16_mma_f16f16: + case NVPTX::BI__hmma_m16n16k16_mma_f32f16: + case NVPTX::BI__hmma_m16n16k16_mma_f32f32: + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: { + Address Dst = EmitPointerWithAlignment(E->getArg(0)); + Address SrcA = EmitPointerWithAlignment(E->getArg(1)); + Address SrcB = EmitPointerWithAlignment(E->getArg(2)); + Address SrcC = EmitPointerWithAlignment(E->getArg(3)); + llvm::APSInt LayoutArg; + if (!E->getArg(4)->isIntegerConstantExpr(LayoutArg, getContext())) + return nullptr; + unsigned Layout = LayoutArg.getZExtValue(); + if (Layout > 3) + return nullptr; + llvm::APSInt SatfArg; + if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext())) + return nullptr; + bool Satf = SatfArg.getZExtValue(); + + // clang-format off +#define MMA_VARIANTS(type) {{ \ + Intrinsic::nvvm_wmma_mma_sync_row_row_##type, \ + Intrinsic::nvvm_wmma_mma_sync_row_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_mma_sync_row_col_##type, \ + Intrinsic::nvvm_wmma_mma_sync_row_col_##type##_satfinite, \ + Intrinsic::nvvm_wmma_mma_sync_col_row_##type, \ + Intrinsic::nvvm_wmma_mma_sync_col_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_mma_sync_col_col_##type, \ + Intrinsic::nvvm_wmma_mma_sync_col_col_##type##_satfinite \ + }} + // clang-format on + + auto getMMAIntrinsic = [Layout, Satf](std::array Variants) { + unsigned Index = Layout * 2 + Satf; + assert(Index < 8); + return Variants[Index]; + }; + unsigned IID; + unsigned NumEltsC; + unsigned NumEltsD; + switch (BuiltinID) { + case NVPTX::BI__hmma_m16n16k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m16n16k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m16n16k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + default: + llvm_unreachable("Unexpected builtin ID."); + } +#undef MMA_VARIANTS + + SmallVector Values; + Function * Intrinsic = CGM.getIntrinsic(IID); + llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0); + // Load A + for (unsigned i = 0; i < 8; ++i) { + Value *V = Builder.CreateAlignedLoad( + Builder.CreateGEP(SrcA.getPointer(), + llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + Values.push_back(Builder.CreateBitCast(V, ABType)); + } + // Load B + for (unsigned i = 0; i < 8; ++i) { + Value *V = Builder.CreateAlignedLoad( + Builder.CreateGEP(SrcB.getPointer(), + llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + Values.push_back(Builder.CreateBitCast(V, ABType)); + } + // Load C + llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16); + for (unsigned i = 0; i < NumEltsC; ++i) { + Value *V = Builder.CreateAlignedLoad( + Builder.CreateGEP(SrcC.getPointer(), + llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + Values.push_back(Builder.CreateBitCast(V, CType)); + } + Value *Result = Builder.CreateCall(Intrinsic, Values); + llvm::Type *DType = Dst.getElementType(); + for (unsigned i = 0; i < NumEltsD; ++i) + Builder.CreateAlignedStore( + Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType), + Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); + return Result; + } default: return nullptr; } Index: clang/test/CodeGen/builtins-nvptx-sm_70.cu =================================================================== --- /dev/null +++ clang/test/CodeGen/builtins-nvptx-sm_70.cu @@ -0,0 +1,166 @@ +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \ +// RUN: -fcuda-is-device -target-feature +ptx60 \ +// RUN: -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK %s +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ +// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s + +#if !defined(CUDA_VERSION) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +typedef unsigned long long uint64_t; +#endif +// We have to keep all builtins that depend on particular target feature in the +// same function, because the codegen will stop after the very first function +// that encounters an error, so -verify will not be able to find errors in +// subsequent functions. + +// CHECK-LABEL: nvvm_wmma +__device__ void nvvm_wmma(int *src, int *dst, + float *fsrc, float *fdst, + int ldm) { + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.row.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} + __hmma_m16n16k16_ld_a(dst, src, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.col.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}} + __hmma_m16n16k16_ld_a(dst, src+1, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.row.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + __hmma_m16n16k16_ld_b(dst, src, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.col.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}} + __hmma_m16n16k16_ld_b(dst, src+2, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}} + __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f32 + // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f32 + // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}} + __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f16 + // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}} + __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f32 + // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f32 + // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}} + __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1); + + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32 + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0); + // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32.satfinite + // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}} + __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1); +}