Index: mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
===================================================================
--- mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -112,6 +112,8 @@
     `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
     `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
   }];
+
+  let hasVerifier = 1;
 }
 
 
Index: mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
===================================================================
--- mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <iostream>
 
 using namespace mlir;
 using namespace mlir::nvgpu;
@@ -88,5 +89,117 @@
   return success();
 }
 
+LogicalResult MmaSyncOp::verify() {
+
+  // Fundamental tensor core mma.sync op
+  // For F32 (TF32), F16, S8, and S4 data types fundamental tensor core
+  // operation is of shape: 8-by-8-by-128b. F64 is an exception. The
+  // verification for mma.sync covering various shapes and data types is based
+  // on the fundamental tensor core operionation.
+  constexpr int kThreads = 32; // 32 threads per warp
+  int64_t shapeM = 8;
+  int64_t shapeN = 8;
+  int64_t shapeK; // set based on data type (128b for all data types except F64)
+
+  // Number of elements A, B, and C per thread per fundamental tensor core tile
+  int64_t numElementA;    // set based on data type (32b except F64)
+  int64_t numElementB;    // set based on data type (32b except F64)
+  int64_t numElementC{2}; // two accumulator elements per fundamental tile
+
+  // nvgpu.mma.sync vector operands (per thread)
+  auto aVector = getMatrixA().getType().cast<VectorType>();
+  auto bVector = getMatrixB().getType().cast<VectorType>();
+  auto cVector = getMatrixC().getType().cast<VectorType>();
+
+  // vector shapes
+  auto aShape = aVector.getShape();
+  auto bShape = bVector.getShape();
+  auto cShape = cVector.getShape();
+
+  // vector element type
+  auto aType = aVector.getElementType();
+  auto bType = bVector.getElementType();
+  auto cType = cVector.getElementType();
+
+  // nvgpu.mma.sync shape (per 32 threads or per warp)
+  int64_t m = getMmaShape()[0].cast<IntegerAttr>().getInt();
+  int64_t n = getMmaShape()[1].cast<IntegerAttr>().getInt();
+  int64_t k = getMmaShape()[2].cast<IntegerAttr>().getInt();
+
+  if (aType.getTypeID() != bType.getTypeID()) {
+    return emitError() << "expected same data type for matrix A and matrix B";
+  }
+
+  if (aType.isF64()) {
+    // exception to 8-by-8-128b fundamental tensor core tile size
+    shapeK = 4;
+    numElementA = 1;
+    numElementB = 1;
+  }
+
+  else if (aType.isF32() || aType.isBF16() || aType.isF16() ||
+           aType.isInteger(8)) {
+    // 8-by-8-128b fundamental tensor core tile size
+    int operandBitwidth = aType.getIntOrFloatBitWidth();
+    shapeK = 128 / operandBitwidth;     // 128b wide shapeK
+    numElementA = 32 / operandBitwidth; // 32b wide operand A
+    numElementB = 32 / operandBitwidth; // 32b wide operand B
+  } else {
+    return emitError() << "expected input data type (i4,i8,f16,bf16,tf32,f64) "
+                          "supported by nvgpu.mma.sync";
+  }
+
+  //
+  // Basic verfication
+  //
+
+  // verify warp-wide size for vector a
+  if (aShape[0] * aShape[1] * kThreads != m * k) {
+    return emitOpError() << "expected " << m * k
+                         << " warp-wide matrix A elements";
+  }
+
+  // verify warp-wide size for vector b
+  if (bShape[0] * bShape[1] * kThreads != k * n) {
+    return emitOpError() << "expected " << k * n
+                         << " warp-wide matrix B elements";
+  }
+
+  // verify warp-wide size for vector c
+  if (cShape[0] * cShape[1] * kThreads != m * n) {
+    return emitOpError() << "expected " << m * n
+                         << " warp-wide matrix C elements";
+  }
+
+  //
+  // Extended verification
+  //
+
+  // tiles of fundamental tensor core operations
+  int64_t mTile = m / shapeM;
+  int64_t nTile = n / shapeN;
+  int64_t kTile = k / shapeK;
+
+  // verify shape of aVector
+  if (!((aShape[0] == mTile * kTile) && (aShape[1] == numElementA))) {
+    return emitOpError() << "expected matrix A to be shaped (" << mTile * kTile
+                         << " x " << numElementA << ")";
+  }
+
+  // verify shape of bVector
+  if (!((bShape[0] == kTile * nTile) && (bShape[1] == numElementB))) {
+    return emitOpError() << "expected matrix B to be shaped (" << kTile * nTile
+                         << " x " << numElementB << ")";
+  }
+
+  // verify shape of cVector
+  if (!((cShape[0] == mTile * nTile) && (cShape[1] == numElementC))) {
+    return emitOpError() << "expected matrix C to be shaped (" << mTile * nTile
+                         << " x " << numElementC << ")";
+  }
+
+  return success();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
Index: mlir/test/Dialect/NVGPU/invalid.mlir
===================================================================
--- mlir/test/Dialect/NVGPU/invalid.mlir
+++ mlir/test/Dialect/NVGPU/invalid.mlir
@@ -1,4 +1,72 @@
 // RUN: mlir-opt -split-input-file -verify-diagnostics %s
+func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
+  // expected-error @+1 {{expected 256 warp-wide matrix A elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>    
+  return %d : vector<2x2xf16>
+}
+// -----
+
+func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
+  // expected-error @+1 {{expected 128 warp-wide matrix B elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>    
+  return %d : vector<2x2xf16>
+}
+// -----
+
+func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
+  // expected-error @+1 {{expected 128 warp-wide matrix C elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>    
+  return %d : vector<2x4xf16>
+}
+// -----
+
+func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
+  // expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>    
+  return %d : vector<2x2xf16>
+}
+// -----
+
+func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
+  // expected-error @+1 {{expected 128 warp-wide matrix A elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>    
+  return %d : vector<2x2xf32>
+}
+// -----
+
+func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
+  // expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>    
+  return %d : vector<2x2xf32>
+}
+// -----
+
+func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
+  // expected-error @+1 {{expected 32 warp-wide matrix A elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>    
+  return %d : vector<1x2xf64>
+}
+// -----
+
+func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
+  // expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>    
+  return %d : vector<2x1xf64>
+}
+// -----
+
+func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
+  // expected-error @+1 {{expected 256 warp-wide matrix B elements}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
+  return %d : vector<2x2xi32>
+}
+// -----
+func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
+  // expected-error @+1 {{expected input data type (i4,i8,f16,bf16,tf32,f64) supported by nvgpu.mma.sync}}
+  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
+  return %d : vector<2x2xi32>
+}
+// -----
 
 func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
   // expected-error @+1 {{destination memref must have memory space 3}}