diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -45,8 +45,7 @@
 /// instead uses a small wrapper library that exports a stable and conveniently
 /// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
 std::unique_ptr<OperationPass<ModuleOp>>
-createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
-    StringRef gpuBinaryAnnotation = "");
+createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation = "");

 /// Collect a set of patterns to convert from the GPU dialect to LLVM.
 void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -82,14 +82,12 @@
 // GPUCommon
 //===----------------------------------------------------------------------===//

-def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime",
-                                                 "ModuleOp"> {
-  let summary = "Convert all launch_func ops to GPU runtime calls";
-  let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()";
+def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
+  let summary = "Convert GPU dialect to LLVM dialect with GPU runtime calls";
+  let constructor = "mlir::createGpuToLLVMConversionPass()";
   let options = [
     Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
-           "\"nvvm.cubin\"",
-           "Annotation attribute string for GPU binary">,
+           "", "Annotation attribute string for GPU binary">,
   ];
 }

diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
--- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
+++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
@@ -440,6 +440,10 @@
                            ConversionPatternRewriter &rewriter,
                            SmallVectorImpl<Value> &sizes) const;

+  /// Computes the size of type in bytes.
+  Value getSizeInBytes(Location loc, Type type,
+                       ConversionPatternRewriter &rewriter) const;
+
   /// Computes total size in bytes of to store the given shape.
   Value getCumulativeSizeInBytes(Location loc, Type elementType,
                                  ArrayRef<Value> shape,
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -741,4 +741,16 @@
   let printer = [{ p << getOperationName(); }];
 }

+def GPU_HostRegisterOp : GPU_Op<"host_register">,
+    Arguments<(ins AnyUnrankedMemRef:$value)>, Results<(outs)> {
+  let summary = "Registers a memref for access from device.";
+  let description = [{
+    This op registers the host memory pointed to by a memref to be accessed from
+    a device.
+  }];
+
+  let assemblyFormat = "$value attr-dict `:` type($value)";
+  let verifier = [{ return success(); }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -39,11 +39,10 @@

 namespace {

-class GpuLaunchFuncToGpuRuntimeCallsPass
-    : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
-          GpuLaunchFuncToGpuRuntimeCallsPass> {
+class GpuToLLVMConversionPass
+    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
 public:
-  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+  GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
     if (!gpuBinaryAnnotation.empty())
       this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
   }
@@ -118,6 +117,24 @@
       "mgpuStreamSynchronize",
       llvmVoidType,
       {llvmPointerType /* void *stream */}};
+  FunctionCallBuilder hostRegisterCallBuilder = {
+      "mgpuMemHostRegisterMemRef",
+      llvmVoidType,
+      {llvmIntPtrType /* intptr_t rank */,
+       llvmPointerType /* void *memrefDesc */,
+       llvmIntPtrType /* intptr_t elementSizeBytes */}};
+};
+
+class ConvertHostRegisterOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
+public:
+  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
 };

 /// A rewrite pattern to convert gpu.launch_func operations into a sequence of
@@ -142,10 +159,8 @@
       gpuBinaryAnnotation(gpuBinaryAnnotation) {}

 private:
-  void addParamToArray(OpBuilder &builder, Location loc, Value param,
-                       Value array, unsigned pos, Value one) const;
-  Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments,
-                            OpBuilder &builder) const;
+  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
+                            ArrayRef<Value> operands, OpBuilder &builder) const;
   Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                    Location loc, OpBuilder &builder) const;
@@ -170,9 +185,10 @@
 } // namespace

-void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() {
+void GpuToLLVMConversionPass::runOnOperation() {
   LLVMTypeConverter converter(&getContext());
   OwningRewritePatternList patterns;
+  populateStdToLLVMConversionPatterns(converter, patterns);
   populateGpuToLLVMConversionPatterns(converter, patterns,
                                       gpuBinaryAnnotation);
   LLVMConversionTarget target(getContext());
@@ -194,91 +210,82 @@
                                builder.getSymbolRefAttr(function), arguments);
 }

-/// Emits the IR with the following structure:
-///
-///   %data = llvm.alloca 1 x type-of(<param>)
-///   llvm.store <param>, %data
-///   %typeErased = llvm.bitcast %data to !llvm<"i8*">
-///   %addr = llvm.getelementptr <array>[<pos>]
-///   llvm.store %typeErased, %addr
-///
-/// This is necessary to construct the array of arguments passed to the kernel
-/// function as accepted by cuLaunchKernel, i.e. as a void** that points to
-/// array of stack-allocated type-erased pointers to the actual arguments.
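For context, the helper removed here and the new generateParamsArray further down both build the argument buffer that cuLaunchKernel expects: a void** whose i-th entry points at the storage of the i-th kernel argument. A minimal host-side C++ sketch of that calling convention, using only the documented CUDA driver API (the helper name, kernel handle, and argument list are hypothetical; error handling omitted):

#include <cuda.h>
#include <cstdint>

// Sketch of the cuLaunchKernel parameter-passing convention the lowering
// targets: each params[i] is a type-erased pointer to the i-th argument value.
CUresult launchWithParams(CUfunction kernel, CUstream stream,
                          CUdeviceptr data, int32_t length,
                          unsigned numThreads) {
  void *params[] = {&data, &length}; // pointers to the argument storage
  return cuLaunchKernel(kernel,
                        /*gridDimX=*/1, /*gridDimY=*/1, /*gridDimZ=*/1,
                        /*blockDimX=*/numThreads, /*blockDimY=*/1,
                        /*blockDimZ=*/1,
                        /*sharedMemBytes=*/0, stream, params,
                        /*extra=*/nullptr);
}

The rewritten generateParamsArray produces the same shape of data, except that the argument values live in a single stack-allocated struct and the array entries are bitcast pointers to its fields.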
-void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray( - OpBuilder &builder, Location loc, Value param, Value array, unsigned pos, - Value one) const { - auto memLocation = builder.create( - loc, param.getType().cast().getPointerTo(), one, - /*alignment=*/0); - builder.create(loc, param, memLocation); - auto casted = - builder.create(loc, llvmPointerType, memLocation); - - auto index = builder.create(loc, llvmInt32Type, - builder.getI32IntegerAttr(pos)); - auto gep = builder.create(loc, llvmPointerPointerType, array, - index.getResult()); - builder.create(loc, casted, gep); +LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + if (!llvm::all_of(operands, [](Value operand) { + return operand.getType().isa(); + })) + return rewriter.notifyMatchFailure( + op, "Cannot convert if operands aren't of LLVM type."); + + auto hostRegisterOp = cast(op); + Location loc = op->getLoc(); + + auto memRefType = hostRegisterOp.value().getType(); + auto elementType = memRefType.cast().getElementType(); + auto elementSize = getSizeInBytes(loc, elementType, rewriter); + + auto arguments = + typeConverter.promoteOperands(loc, op->getOperands(), operands, rewriter); + arguments.push_back(elementSize); + hostRegisterCallBuilder.create(loc, rewriter, arguments); + + rewriter.eraseOp(op); + return success(); } -// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel -// launch call. The arguments are extracted from the launchOp. +// Creates a struct containing all kernel parameters on the stack and returns +// an array of type-erased pointers to the fields of the struct. The array can +// then be passed to the CUDA / ROCm (HIP) kernel launch calls. // The generated code is essentially as follows: // -// %array = alloca(numparams * sizeof(void *)) -// for (i : [0, NumKernelOperands)) -// %array[i] = cast(KernelOperand[i]) +// %struct = alloca(sizeof(struct { Parameters... 
})) +// %array = alloca(NumParameters * sizeof(void *)) +// for (i : [0, NumParameters)) +// %fieldPtr = llvm.getelementptr %struct[0, i] +// llvm.store parameters[i], %fieldPtr +// %elementPtr = llvm.getelementptr %array[i] +// llvm.store %fieldPtr, %elementPtr // return %array Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( - gpu::LaunchFuncOp launchOp, unsigned numArguments, + gpu::LaunchFuncOp launchOp, ArrayRef operands, OpBuilder &builder) const { + auto loc = launchOp.getLoc(); auto numKernelOperands = launchOp.getNumKernelOperands(); - Location loc = launchOp.getLoc(); + auto arguments = typeConverter.promoteOperands( + loc, launchOp.getOperands().take_back(numKernelOperands), + operands.take_back(numKernelOperands), builder); + auto numArguments = arguments.size(); + SmallVector argumentTypes; + argumentTypes.reserve(numArguments); + for (auto argument : arguments) + argumentTypes.push_back(argument.getType().cast()); + auto structType = LLVM::LLVMType::createStructTy(argumentTypes, StringRef()); auto one = builder.create(loc, llvmInt32Type, builder.getI32IntegerAttr(1)); + auto structPtr = builder.create( + loc, structType.getPointerTo(), one, /*alignment=*/0); auto arraySize = builder.create( loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments)); - auto array = builder.create(loc, llvmPointerPointerType, - arraySize, /*alignment=*/0); - - unsigned pos = 0; - for (unsigned idx = 0; idx < numKernelOperands; ++idx) { - auto operand = launchOp.getKernelOperand(idx); - auto llvmType = operand.getType().cast(); - - // Assume all struct arguments come from MemRef. If this assumption does not - // hold anymore then we `launchOp` to lower from MemRefType and not after - // LLVMConversion has taken place and the MemRef information is lost. - if (!llvmType.isStructTy()) { - addParamToArray(builder, loc, operand, array, pos++, one); - continue; - } - - // Put individual components of a memref descriptor into the flat argument - // list. We cannot use unpackMemref from LLVM lowering here because we have - // no access to MemRefType that had been lowered away. 
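The code removed just below unpacks a memref descriptor field by field; the new lowering instead relies on promoteOperands, and the runtime wrapper introduced later receives the same descriptor by pointer. As a reference, a rough C++ sketch of that descriptor layout, mirroring StridedMemRefType from the MLIR execution-engine headers (field and function names here are illustrative, not the generated ones):

#include <cstdint>

// Approximate C++ picture of the struct a ranked memref lowers to: allocated
// pointer, aligned pointer, offset, then one size and one stride per
// dimension. promoteOperands expands exactly these fields into the flat
// kernel argument list.
template <typename T, int Rank>
struct MemRefDescriptor {
  T *allocated;          // pointer returned by the allocator
  T *aligned;            // aligned pointer used for accesses
  int64_t offset;        // element offset of the first element
  int64_t sizes[Rank];   // extent of each dimension
  int64_t strides[Rank]; // element stride of each dimension
};

// Element count described by the sizes, e.g. 12 for a memref<2x6xi32>.
template <typename T, int Rank>
int64_t numElements(const MemRefDescriptor<T, Rank> &desc) {
  int64_t count = 1;
  for (int i = 0; i < Rank; ++i)
    count *= desc.sizes[i];
  return count;
}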
- for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) { - auto elemType = llvmType.getStructElementType(j); - if (elemType.isArrayTy()) { - for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) { - Value elem = builder.create( - loc, elemType.getArrayElementType(), operand, - builder.getI32ArrayAttr({j, k})); - addParamToArray(builder, loc, elem, array, pos++, one); - } - } else { - assert((elemType.isIntegerTy() || elemType.isFloatTy() || - elemType.isDoubleTy() || elemType.isPointerTy()) && - "expected scalar type"); - Value strct = builder.create( - loc, elemType, operand, builder.getI32ArrayAttr(j)); - addParamToArray(builder, loc, strct, array, pos++, one); - } - } + auto arrayPtr = builder.create(loc, llvmPointerPointerType, + arraySize, /*alignment=*/0); + auto zero = builder.create(loc, llvmInt32Type, + builder.getI32IntegerAttr(0)); + for (auto en : llvm::enumerate(arguments)) { + auto index = builder.create( + loc, llvmInt32Type, builder.getI32IntegerAttr(en.index())); + auto fieldPtr = + builder.create(loc, structType.getPointerTo(), structPtr, + ArrayRef{zero, index.getResult()}); + builder.create(loc, en.value(), fieldPtr); + auto elementPtr = builder.create(loc, llvmPointerPointerType, + arrayPtr, index.getResult()); + auto casted = + builder.create(loc, llvmPointerType, fieldPtr); + builder.create(loc, casted, elementPtr); } - - return array; + return arrayPtr; } // Generates an LLVM IR dialect global that contains the name of the given @@ -320,14 +327,19 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - Location loc = op->getLoc(); + if (!llvm::all_of(operands, [](Value operand) { + return operand.getType().isa(); + })) + return failure(); // Cannot convert if operands aren't of LLVM type. + auto launchOp = cast(op); - auto moduleOp = op->getParentOfType(); + Location loc = launchOp.getLoc(); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. auto kernelModule = - moduleOp.lookupSymbol(launchOp.getKernelModuleName()); + launchOp.getParentOfType().lookupSymbol( + launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); @@ -350,25 +362,13 @@ launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter); auto function = moduleGetFunctionCallBuilder.create( loc, rewriter, {module.getResult(0), kernelName}); - // Grab the global stream needed for execution. - auto stream = streamCreateCallBuilder.create(loc, rewriter, {}); - - // Get the launch target. - auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.kernel()); - if (!gpuFuncOp) { - launchOp.emitOpError() << "corresponding kernel function not found"; - return failure(); - } - // Build array of kernel parameters. - auto kernelParams = - generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter); - - // Invoke the function with required arguments. auto zero = rewriter.create(loc, llvmInt32Type, rewriter.getI32IntegerAttr(0)); - auto nullpointer = - rewriter.create(loc, llvmPointerPointerType, zero); + // Grab the global stream needed for execution. + auto stream = streamCreateCallBuilder.create(loc, rewriter, {}); + // Create array of pointers to kernel arguments. 
+ auto kernelParams = generateParamsArray(launchOp, operands, rewriter); + auto nullpointer = rewriter.create(loc, llvmPointerPointerType); launchKernelCallBuilder.create( loc, rewriter, {function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(), @@ -384,15 +384,14 @@ } std::unique_ptr> -mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass( - StringRef gpuBinaryAnnotation) { - return std::make_unique( - gpuBinaryAnnotation); +mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) { + return std::make_unique(gpuBinaryAnnotation); } void mlir::populateGpuToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns, StringRef gpuBinaryAnnotation) { + patterns.insert(converter); patterns.insert( converter, gpuBinaryAnnotation); patterns.insert(&converter.getContext()); diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -927,30 +927,32 @@ : createIndexConstant(rewriter, loc, s)); } -Value ConvertToLLVMPattern::getCumulativeSizeInBytes( - Location loc, Type elementType, ArrayRef sizes, - ConversionPatternRewriter &rewriter) const { - // Compute the total number of memref elements. - Value cumulativeSizeInBytes = - sizes.empty() ? createIndexConstant(rewriter, loc, 1) : sizes.front(); - for (unsigned i = 1, e = sizes.size(); i < e; ++i) - cumulativeSizeInBytes = rewriter.create( - loc, getIndexType(), ArrayRef{cumulativeSizeInBytes, sizes[i]}); - +Value ConvertToLLVMPattern::getSizeInBytes( + Location loc, Type type, ConversionPatternRewriter &rewriter) const { // Compute the size of an individual element. This emits the MLIR equivalent // of the following sizeof(...) implementation in LLVM IR: // %0 = getelementptr %elementType* null, %indexType 1 // %1 = ptrtoint %elementType* %0 to %indexType // which is a common pattern of getting the size of a type in bytes. - auto convertedPtrType = typeConverter.convertType(elementType) - .cast() - .getPointerTo(); + auto convertedPtrType = + typeConverter.convertType(type).cast().getPointerTo(); auto nullPtr = rewriter.create(loc, convertedPtrType); auto gep = rewriter.create( loc, convertedPtrType, ArrayRef{nullPtr, createIndexConstant(rewriter, loc, 1)}); - auto elementSize = - rewriter.create(loc, getIndexType(), gep); + return rewriter.create(loc, getIndexType(), gep); +} + +Value ConvertToLLVMPattern::getCumulativeSizeInBytes( + Location loc, Type elementType, ArrayRef sizes, + ConversionPatternRewriter &rewriter) const { + // Compute the total number of memref elements. + Value cumulativeSizeInBytes = + sizes.empty() ? 
createIndexConstant(rewriter, loc, 1) : sizes.front(); + for (unsigned i = 1, e = sizes.size(); i < e; ++i) + cumulativeSizeInBytes = rewriter.create( + loc, getIndexType(), ArrayRef{cumulativeSizeInBytes, sizes[i]}); + auto elementSize = this->getSizeInBytes(loc, elementType, rewriter); return rewriter.create( loc, getIndexType(), ArrayRef{cumulativeSizeInBytes, elementSize}); } diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL +// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL module attributes {gpu.container_module} { diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + gpu.host_register %cast_data : memref<*xi32> %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + gpu.host_register %cast_sum : memref<*xi32> store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + gpu.host_register %cast_data : memref<*xi32> %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + gpu.host_register %cast_sum : memref<*xi32> store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + gpu.host_register %cast_data : memref<*xi32> %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + gpu.host_register %cast_sum : 
memref<*xi32> store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir @@ -11,7 +11,7 @@ %sy = dim %dst, %c1 : memref %sz = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + gpu.host_register %cast_dst : memref<*xf32> gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz) { %t0 = muli %tz, %block_y : index @@ -28,5 +28,4 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + gpu.host_register %cast_data : memref<*xi32> %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + gpu.host_register %cast_sum : memref<*xi32> store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir @@ -8,7 +8,7 @@ %c0 = constant 0 : index %sx = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + gpu.host_register %cast_dst : memref<*xf32> gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %val = index_cast %tx : index to i32 @@ -25,5 +25,4 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir @@ -25,9 +25,9 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + gpu.host_register %cast_data : memref<*xi32> %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + gpu.host_register %cast_sum : memref<*xi32> store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir --- 
a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir +++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir @@ -18,7 +18,7 @@ %21 = constant 5 : i32 %22 = memref_cast %arg0 : memref<5xf32> to memref %23 = memref_cast %22 : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%23) : (memref<*xf32>) -> () + gpu.host_register %23 : memref<*xf32> call @print_memref_f32(%23) : (memref<*xf32>) -> () %24 = constant 1.0 : f32 call @other_func(%24, %22) : (f32, memref) -> () @@ -26,5 +26,4 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir --- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -26,11 +26,11 @@ %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xf32> to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_data) : (memref<*xf32>) -> () + gpu.host_register %cast_data : memref<*xf32> %cast_sum = memref_cast %sum : memref<2xf32> to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_sum) : (memref<*xf32>) -> () + gpu.host_register %cast_sum : memref<*xf32> %cast_mul = memref_cast %mul : memref<2xf32> to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_mul) : (memref<*xf32>) -> () + gpu.host_register %cast_mul : memref<*xf32> store %cst0, %data[%c0, %c0] : memref<2x6xf32> store %cst1, %data[%c0, %c1] : memref<2x6xf32> @@ -66,5 +66,4 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir --- a/mlir/test/mlir-cuda-runner/shuffle.mlir +++ b/mlir/test/mlir-cuda-runner/shuffle.mlir @@ -7,8 +7,8 @@ %one = constant 1 : index %c0 = constant 0 : index %sx = dim %dst, %c0 : memref - %cast_dest = memref_cast %dst : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast_dest) : (memref<*xf32>) -> () + %cast_dst = memref_cast %dst : memref to memref<*xf32> + gpu.host_register %cast_dst : memref<*xf32> gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to i32 @@ -24,9 +24,8 @@ store %value, %dst[%tx] : memref gpu.terminator } - call @print_memref_f32(%cast_dest) : (memref<*xf32>) -> () + call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> () return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir --- a/mlir/test/mlir-cuda-runner/two-modules.mlir +++ b/mlir/test/mlir-cuda-runner/two-modules.mlir @@ -8,7 +8,7 @@ %c0 = constant 0 : index %sx = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () + gpu.host_register %cast_dst : memref<*xi32> gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to i32 @@ -25,5 +25,4 @@ return } -func @mgpuMemHostRegisterInt32(%memref : memref<*xi32>) func @print_memref_i32(%memref : memref<*xi32>) diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir --- 
a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir +++ b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir @@ -18,7 +18,7 @@ %21 = constant 5 : i32 %22 = memref_cast %arg0 : memref<5xf32> to memref %cast = memref_cast %22 : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast) : (memref<*xf32>) -> () + gpu.host_register %cast : memref<*xf32> %23 = memref_cast %22 : memref to memref<*xf32> call @print_memref_f32(%23) : (memref<*xf32>) -> () %24 = constant 1.0 : f32 @@ -28,6 +28,5 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref) -> (memref) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/mlir-rocm-runner/two-modules.mlir --- a/mlir/test/mlir-rocm-runner/two-modules.mlir +++ b/mlir/test/mlir-rocm-runner/two-modules.mlir @@ -8,7 +8,7 @@ %c1 = constant 1 : index %sx = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xi32> - call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () + gpu.host_register %cast_dst : memref<*xi32> %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref) -> (memref) gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %c1, %block_z = %c1) { @@ -26,6 +26,5 @@ return } -func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref) -> (memref) func @print_memref_i32(%ptr : memref<*xi32>) diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/mlir-rocm-runner/vecadd.mlir --- a/mlir/test/mlir-rocm-runner/vecadd.mlir +++ b/mlir/test/mlir-rocm-runner/vecadd.mlir @@ -26,9 +26,9 @@ %6 = memref_cast %3 : memref to memref<*xf32> %7 = memref_cast %4 : memref to memref<*xf32> %8 = memref_cast %5 : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%6) : (memref<*xf32>) -> () - call @mgpuMemHostRegisterFloat(%7) : (memref<*xf32>) -> () - call @mgpuMemHostRegisterFloat(%8) : (memref<*xf32>) -> () + gpu.host_register %6 : memref<*xf32> + gpu.host_register %7 : memref<*xf32> + gpu.host_register %8 : memref<*xf32> %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref) -> (memref) %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref) -> (memref) %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref) -> (memref) @@ -38,6 +38,5 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref) -> (memref) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-rocm-runner/vector-transferops.mlir b/mlir/test/mlir-rocm-runner/vector-transferops.mlir --- a/mlir/test/mlir-rocm-runner/vector-transferops.mlir +++ b/mlir/test/mlir-rocm-runner/vector-transferops.mlir @@ -55,8 +55,8 @@ %cast0 = memref_cast %22 : memref to memref<*xf32> %cast1 = memref_cast %23 : memref to memref<*xf32> - call @mgpuMemHostRegisterFloat(%cast0) : (memref<*xf32>) -> () - call @mgpuMemHostRegisterFloat(%cast1) : (memref<*xf32>) -> () + gpu.host_register %cast0 : memref<*xf32> + gpu.host_register %cast1 : memref<*xf32> %24 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref) -> (memref) %26 = call @mgpuMemGetDeviceMemRef1dFloat(%23) : (memref) -> (memref) @@ -71,6 +71,5 @@ return } -func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref) -> (memref) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp 
b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -75,17 +75,19 @@ CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); } -// Allows to register a MemRef with the CUDA runtime. Initializes array with -// value. Helpful until we have transfer functions implemented. -template -void mgpuMemHostRegisterMemRef(const DynamicMemRefType &memRef, T value) { - llvm::SmallVector denseStrides(memRef.rank); - llvm::ArrayRef sizes(memRef.sizes, memRef.rank); - llvm::ArrayRef strides(memRef.strides, memRef.rank); +// Allows to register a MemRef with the CUDA runtime. Helpful until we have +// transfer functions implemented. +extern "C" void +mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType *descriptor, + int64_t elementSizeBytes) { + + llvm::SmallVector denseStrides(rank); + llvm::ArrayRef sizes(descriptor->sizes, rank); + llvm::ArrayRef strides(sizes.end(), rank); std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(), std::multiplies()); - auto count = denseStrides.front(); + auto sizeBytes = denseStrides.front() * elementSizeBytes; // Only densely packed tensors are currently supported. std::rotate(denseStrides.begin(), denseStrides.begin() + 1, @@ -93,17 +95,6 @@ denseStrides.back() = 1; assert(strides == llvm::makeArrayRef(denseStrides)); - auto *pointer = memRef.data + memRef.offset; - std::fill_n(pointer, count, value); - mgpuMemHostRegister(pointer, count * sizeof(T)); -} - -extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) { - UnrankedMemRefType memRef = {rank, ptr}; - mgpuMemHostRegisterMemRef(DynamicMemRefType(memRef), 1.23f); -} - -extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) { - UnrankedMemRefType memRef = {rank, ptr}; - mgpuMemHostRegisterMemRef(DynamicMemRefType(memRef), 123); + auto ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostRegister(ptr, sizeBytes); } diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -118,9 +118,7 @@ kernelPm.addPass(createConvertGPUKernelToBlobPass( translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", "sm_35", "+ptx60", gpuBinaryAnnotation)); - pm.addPass(createLowerToLLVMPass()); - pm.addPass( - createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); + pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); return pm.run(m); } diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp --- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp +++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp @@ -309,9 +309,7 @@ kernelPm.addPass(createConvertGPUKernelToBlobPass( compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip, features, gpuBinaryAnnotation)); - pm.addPass(createLowerToLLVMPass()); - pm.addPass( - createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); + pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); return pm.run(m); } diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp --- a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp +++ b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp @@ -76,17 +76,19 @@ HIP_REPORT_IF_ERROR(hipHostRegister(ptr, 
                                     sizeBytes, /*flags=*/0));
 }

-// Allows to register a MemRef with the ROCM runtime. Initializes array with
-// value. Helpful until we have transfer functions implemented.
-template <typename T>
-void mgpuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
-                               llvm::ArrayRef<int64_t> strides, T value) {
-  assert(sizes.size() == strides.size());
-  llvm::SmallVector<int64_t, 4> denseStrides(strides.size());
+// Allows to register a MemRef with the ROCm (HIP) runtime. Helpful until we
+// have transfer functions implemented.
+extern "C" void
+mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
+                          int64_t elementSizeBytes) {
+
+  llvm::SmallVector<int64_t, 4> denseStrides(rank);
+  llvm::ArrayRef<int64_t> sizes(descriptor->sizes, rank);
+  llvm::ArrayRef<int64_t> strides(sizes.end(), rank);
   std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
                    std::multiplies<int64_t>());
-  auto count = denseStrides.front();
+  auto sizeBytes = denseStrides.front() * elementSizeBytes;

   // Only densely packed tensors are currently supported.
   std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
@@ -94,22 +96,8 @@
   denseStrides.back() = 1;
   assert(strides == llvm::makeArrayRef(denseStrides));

-  std::fill_n(pointer, count, value);
-  mgpuMemHostRegister(pointer, count * sizeof(T));
-}
-
-extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
-  auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
-  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
-  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
-  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
-}
-
-extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
-  auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
-  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
-  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
-  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
+  auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  mgpuMemHostRegister(ptr, sizeBytes);
 }

 template
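The stride handling shared by the CUDA and ROCm wrappers above is easier to follow outside the diff. A standalone C++ sketch of the same computation, with no CUDA/HIP or LLVM dependencies (the function name is ours, not part of the wrappers):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Restates the stride math in mgpuMemHostRegisterMemRef: suffix-multiply the
// sizes to get the element count, then rotate to obtain the row-major strides
// of a densely packed buffer and check them against the descriptor's strides.
// Returns the total number of bytes to register with the runtime.
int64_t denseSizeInBytes(const std::vector<int64_t> &sizes,
                         const std::vector<int64_t> &strides,
                         int64_t elementSizeBytes) {
  assert(!sizes.empty() && sizes.size() == strides.size());
  std::vector<int64_t> denseStrides(sizes.size());
  std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
                   std::multiplies<int64_t>());
  int64_t totalBytes = denseStrides.front() * elementSizeBytes;
  // Shift the products left by one and set the innermost stride to 1, e.g.
  // sizes {2, 6} -> denseStrides {6, 1}; only dense layouts are supported.
  std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
              denseStrides.end());
  denseStrides.back() = 1;
  assert(strides == denseStrides);
  return totalBytes;
}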