diff --git a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
--- a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_CONVERSION_LLVMCOMMON_LOWERINGOPTIONS_H
 #define MLIR_CONVERSION_LLVMCOMMON_LOWERINGOPTIONS_H
 
+#include "mlir/IR/BuiltinTypes.h"
 #include "llvm/IR/DataLayout.h"
 
 namespace mlir {
@@ -66,6 +67,9 @@
   /// Get the index bitwidth.
   unsigned getIndexBitwidth() const { return indexBitwidth; }
 
+  /// Hook to customize the index type used when converting a MemRefType.
+  llvm::function_ref<IntegerType(MemRefType)> memrefIndexTypeConverter = nullptr;
+
 private:
   unsigned indexBitwidth;
 };
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -653,10 +653,18 @@
 
 } // namespace
 
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() == 3)
+    // i.e. nvgpu::NVGPUDialect::kSharedMemoryAddressSpace.
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
 void GpuToLLVMConversionPass::runOnOperation() {
   LowerToLLVMOptions options(&getContext());
   options.useOpaquePointers = useOpaquePointers;
   options.useBarePtrCallConv = hostBarePtrCallConv;
+  options.memrefIndexTypeConverter = getIndexTypeForMemRef;
   LLVMTypeConverter converter(&getContext(), options);
 
   RewritePatternSet patterns(&getContext());
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -202,6 +202,13 @@
 /// Import the GPU Ops to NVVM Patterns.
 #include "GPUToNVVM.cpp.inc"
 
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() == 3)
+    // i.e. nvgpu::NVGPUDialect::kSharedMemoryAddressSpace.
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
 /// A pass that replaces all occurrences of GPU device operations with their
 /// corresponding NVVM equivalent.
 ///
@@ -232,6 +239,7 @@
     options.overrideIndexBitwidth(indexBitwidth);
   options.useOpaquePointers = useOpaquePointers;
   options.useBarePtrCallConv = useBarePtrCallConv;
+  options.memrefIndexTypeConverter = getIndexTypeForMemRef;
 
   // Apply in-dialect lowering. In-dialect lowering will replace
   // ops which need to be lowered further, which is not supported by a
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -339,7 +339,9 @@
   }
 
   auto ptrTy = getPointerType(elementType, *addressSpace);
-  auto indexTy = getIndexType();
+  auto indexTy = options.memrefIndexTypeConverter
+                     ? options.memrefIndexTypeConverter(type)
+                     : getIndexType();
 
   SmallVector<Type, 5> results = {ptrTy, ptrTy, indexTy};
   auto rank = type.getRank();
diff --git a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
--- a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
@@ -14,6 +14,7 @@
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRDataLayoutInterfaces
+  MLIRIndexDialect
   MLIRLLVMCommonConversion
   MLIRMemRefDialect
   MLIRMemRefUtils
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -12,11 +12,14 @@
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
@@ -29,15 +32,22 @@
 
 using namespace mlir;
 
-/// GPU has 32 bit registers, this function truncates values when larger width
-/// is not needed.
+/// GPU has 32 bit registers; this function truncates values when a larger
+/// width is not needed.
 static Value truncToI32(ConversionPatternRewriter &rewriter, Location loc,
                         Value value) {
   Type type = value.getType();
+  if (llvm::isa<IndexType>(type))
+    return rewriter.create(loc, rewriter.getI32Type(), value);
+
   assert(llvm::isa<IntegerType>(type) && "expected an integer Value");
   if (type.getIntOrFloatBitWidth() <= 32)
     return value;
-  return rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), value);
+  // Avoid direct use of LLVM and instead roundtrip through the index dialect,
+  // which connects things properly.
+  Value index =
+      rewriter.create(loc, rewriter.getIndexType(), value);
+  return rewriter.create(loc, rewriter.getI32Type(), index);
 }
 
 /// Returns the type for the intrinsic given the vectorResultType of the
@@ -97,8 +107,8 @@
   Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
 
   auto makeConst = [&](int32_t index) -> Value {
-    return rewriter.create<LLVM::ConstantOp>(loc, IntegerType::get(ctx, 32),
-                                             rewriter.getI32IntegerAttr(index));
+    return rewriter.create(loc,
+                           rewriter.getIndexAttr(index));
   };
 
   if (arrayType) {
@@ -196,8 +206,8 @@
          idx < innerSize; idx++) {
       result.push_back(rewriter.create<LLVM::ExtractElementOp>(
           loc, toUse,
-          rewriter.create<LLVM::ConstantOp>(
-              loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(idx))));
+          rewriter.create(loc,
+                          rewriter.getIndexAttr(idx))));
     }
     continue;
   }
@@ -390,47 +400,6 @@
                                   memrefType);
 }
 
-struct ConvertNVGPUToNVVMPass
-    : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
-  using Base::Base;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert();
-  }
-
-  void runOnOperation() override {
-    LowerToLLVMOptions options(&getContext());
-    options.useOpaquePointers = useOpaquePointers;
-    RewritePatternSet patterns(&getContext());
-    LLVMTypeConverter converter(&getContext(), options);
-    IRRewriter rewriter(&getContext());
-    /// device-side async tokens cannot be materialized in nvvm. We just
-    /// convert them to a dummy i32 type in order to easily drop them during
-    /// conversion.
-    converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
-      return converter.convertType(IntegerType::get(type.getContext(), 32));
-    });
-    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
-      return converter.convertType(IntegerType::get(type.getContext(), 64));
-    });
-    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
-      return converter.convertType(createMBarrierMemrefType(rewriter, type));
-    });
-    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
-      return converter.getPointerType(type.getTensor().getElementType());
-    });
-    populateNVGPUToNVVMConversionPatterns(converter, patterns);
-    LLVMConversionTarget target(getContext());
-    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
-    target.addLegalDialect<::mlir::memref::MemRefDialect>();
-    target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
-    if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns))))
-      signalPassFailure();
-  }
-};
-
 /// Returns the constraints for the sparse MMA inline assembly instruction.
 static std::string buildMmaSparseAsmConstraintString(unsigned matASize,
                                                      unsigned matBSize,
@@ -655,11 +624,10 @@
       // memory) of CpAsyncOp is read only for SrcElements number of elements.
      // The rest of the DstElements in the destination (shared memory) are
       // filled with zeros.
-      Value c3I32 = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
-      Value bitwidth = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(),
-          rewriter.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
+      Value c3I32 =
+          rewriter.create(loc, rewriter.getIndexAttr(3));
+      Value bitwidth = rewriter.create(
+          loc, rewriter.getIndexAttr(srcMemrefType.getElementTypeBitWidth()));
       Value srcElementsI32 =
           rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcBytes);
       srcBytes = rewriter.create<LLVM::LShrOp>(
@@ -679,9 +647,8 @@
                                               srcBytes);
 
     // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
+    Value zero = rewriter.create(op->getLoc(),
+                                 rewriter.getIndexAttr(0));
     rewriter.replaceOp(op, zero);
     return success();
   }
@@ -697,9 +664,8 @@
                   ConversionPatternRewriter &rewriter) const override {
     rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
     // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
+    Value zero = rewriter.create(op->getLoc(),
+                                 rewriter.getIndexAttr(0));
     rewriter.replaceOp(op, zero);
     return success();
   }
@@ -775,8 +741,7 @@
     rewriter.setInsertionPoint(op);
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(rewriter, op->getLoc(), op.getCount());
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(op, barrier,
@@ -824,7 +789,7 @@
                                    op.getBarrier(), adaptor.getBarrier());
     Type tokenType = getTypeConverter()->convertType(
         nvgpu::MBarrierTokenType::get(op->getContext()));
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(rewriter, op->getLoc(), op.getCount());
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
           op, tokenType, barrier, count);
@@ -869,7 +834,7 @@
                   ConversionPatternRewriter &rewriter) const override {
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-    Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount());
+    Value txcount = truncToI32(rewriter, op->getLoc(), op.getTxcount());
 
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
@@ -893,8 +858,8 @@
                   ConversionPatternRewriter &rewriter) const override {
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-    Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks());
-    Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase());
+    Value ticks = truncToI32(rewriter, op->getLoc(), op.getTicks());
+    Value phase = truncToI32(rewriter, op->getLoc(), op.getPhase());
 
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
@@ -919,7 +884,7 @@
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
 
-    SmallVector<Value> coords = adaptor.getCoordinates();
+    SmallVector<Value> coords = op.getCoordinates();
     for (auto [index, value] : llvm::enumerate(coords)) {
       coords[index] = truncToI32(rewriter, op->getLoc(), value);
     }
@@ -930,6 +895,9 @@
   }
 };
 
+/// Create an i64 LLVM constant value. This should only be used with
+/// unambiguous sink operations where we know for a fact that the underlying
+/// LLVM op precisely wants an i64.
 static Value makeI64Const(RewriterBase &rewriter, Operation *op,
                           int32_t index) {
   return rewriter.create<LLVM::ConstantOp>(op->getLoc(),
@@ -1063,3 +1031,57 @@
                NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
                NVGPUMmaSparseSyncLowering>(converter);
 }
+
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() ==
+      nvgpu::NVGPUDialect::kSharedMemoryAddressSpace)
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
+namespace {
+
+struct ConvertNVGPUToNVVMPass
+    : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
+  using Base::Base;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override {
+    LowerToLLVMOptions options(&getContext());
+    options.useOpaquePointers = useOpaquePointers;
+    options.memrefIndexTypeConverter = getIndexTypeForMemRef;
+    RewritePatternSet patterns(&getContext());
+    LLVMTypeConverter converter(&getContext(), options);
+    IRRewriter rewriter(&getContext());
+    /// device-side async tokens cannot be materialized in nvvm. We just
+    /// convert them to a dummy i32 type in order to easily drop them during
+    /// conversion.
+    converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
+      return converter.convertType(IntegerType::get(type.getContext(), 32));
+    });
+    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
+      return converter.convertType(IntegerType::get(type.getContext(), 64));
+    });
+    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+      return converter.convertType(createMBarrierMemrefType(rewriter, type));
+    });
+    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
+      return converter.getPointerType(type.getTensor().getElementType());
+    });
+    populateNVGPUToNVVMConversionPatterns(converter, patterns);
+    LLVMConversionTarget target(getContext());
+    target.addLegalDialect<::mlir::index::IndexDialect>();
+    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+    target.addLegalDialect<::mlir::memref::MemRefDialect>();
+    target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
--- a/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
@@ -1,15 +1,5 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
-// RUN:   -convert-scf-to-cf -convert-nvvm-to-llvm \
-// RUN:   -convert-vector-to-llvm \
-// RUN:   -convert-math-to-llvm \
-// RUN:   -expand-strided-metadata \
-// RUN:   -lower-affine \
-// RUN:   -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:   -convert-arith-to-llvm \
-// RUN:   -finalize-memref-to-llvm \
-// RUN:   -convert-func-to-llvm \
-// RUN:   -canonicalize \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
+// RUN: mlir-opt %s \
+// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_90 cubin-features=+ptx80 dump-ptx" \
 // RUN: 2&>1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // CHECK-PTX: mbarrier.init.shared.b64
@@ -87,4 +77,4 @@
     }
     return
   }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
@@ -70,6 +71,9 @@
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
       llvm::cl::init("+ptx76")};
+  PassOptions::Option<bool> dumpPtx{
+      *this, "dump-ptx", llvm::cl::desc("Whether to dump the produced ptx."),
+      llvm::cl::init(false)};
 };
 
 //===----------------------------------------------------------------------===//
@@ -124,6 +128,14 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
 
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
+
   // TODO: C++20 designated initializers.
   // The following pass is inconsistent.
   // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
@@ -134,13 +146,6 @@
       createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
                                      options.kernelIndexBitWidth));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
   // TODO: C++20 designated initializers.
   GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
   // Note: hostBarePtrCallConv must be false for now otherwise
@@ -164,6 +169,8 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -173,7 +180,8 @@
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
   pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+      options.cubinTriple, options.cubinChip, options.cubinFeatures,
+      /*optLevel=*/2, /*dumpPtx=*/options.dumpPtx));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
@@ -182,8 +190,6 @@
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Important, must be run at the top-level.
-  pm.addPass(createGpuKernelOutliningPass());
 
   // Important, all host passes must be run at the func level so that host
   // conversions can remain with 64 bit indices without polluting the GPU
@@ -228,17 +234,6 @@
   pm.addNestedPass<func::FuncOp>(
       createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
   // Sprinkle some cleanups.
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(createCSEPass());
@@ -246,6 +241,20 @@
   //===----------------------------------------------------------------------===//
   // GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
+
+  // Due to gpu::LaunchOp and gpu::LaunchFuncOp layering and conversions, there
+  // is currently a need to call convertNVGPUToNVVM at the top-level to get
+  // proper types at the function boundary for the TMA descriptors.
+  // TODO: Fix this broken layering: conversion of the TMA descriptor should be
+  // separated from introducing LLVM types.
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addPass(createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
+  // Important, must be run at the top-level.
+  pm.addPass(createGpuKernelOutliningPass());
+
   buildGpuPassPipeline(pm, options);
 
   //===----------------------------------------------------------------------===//
@@ -258,6 +267,8 @@
   pm.addNestedPass<func::FuncOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  pm.addPass(createConvertNVVMToLLVMPass());
+
   ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
   // Must be 64b on the host, things don't compose properly around
   // gpu::LaunchOp and gpu::HostRegisterOp.
@@ -265,6 +276,8 @@
   convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
   pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
 
+  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
+
   // This must happen after cubin translation otherwise gpu.launch_func is
   // illegal if no cubin annotation is present.
   // TODO: C++20 designated initializers.