diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -55,6 +55,24 @@
 
 thread_local static int32_t defaultDevice = 0;
 
+const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG";
+
+/// Helper method that checks environment value for debugging.
+bool isDebugEnabled() {
+  static bool isInitialized = false;
+  static bool isEnabled = false;
+  if (!isInitialized)
+    isEnabled = getenv(kDebugEnvironmentVariable) != nullptr;
+  return isEnabled;
+}
+
+#define debug_print(fmt, ...)                                                  \
+  do {                                                                         \
+    if (isDebugEnabled())                                                      \
+      fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \
+              __func__, __VA_ARGS__);                                          \
+  } while (0)
+
 // Make the primary context of the current default device current for the
 // duration
 //  of the instance and restore the previous context on destruction.
@@ -273,6 +291,24 @@
       tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
       globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
       oobFill));
+  debug_print("Created TMA descriptor\n Addr: %p\n"
+              "data type : %d\n"
+              "rank : %d\n"
+              "globalDim[5]: %zu, %zu, %zu, %zu, %zu\n"
+              "globalStrides[5]: %zu, %zu, %zu, %zu, %zu\n"
+              "boxDim[5]: %u, %u, %u, %u, %u\n"
+              "elementStrides[5]: %u, %u, %u, %u, %u\n"
+              "interleave: %u \n"
+              "swizzle: %u \n"
+              "l2Promotion: %u \n"
+              "oobFill: %u \n",
+              (void *)&tensorMap, tensorDataType, tensorRank, globalDim[0],
+              globalDim[1], globalDim[2], globalDim[3], globalDim[4],
+              globalStrides[0], globalStrides[1], globalStrides[2],
+              globalStrides[3], globalStrides[4], boxDim[0], boxDim[1],
+              boxDim[2], boxDim[3], boxDim[4], elementStrides[0],
+              elementStrides[1], elementStrides[2], elementStrides[3],
+              elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(