diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -14,10 +14,12 @@
 
 namespace mlir {
 
+class LLVMTypeConverter;
 class Location;
 struct LogicalResult;
 class ModuleOp;
 class Operation;
+class OwningRewritePatternList;
 
 template <typename T>
 class OperationPass;
@@ -46,6 +48,11 @@
 createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
     StringRef gpuBinaryAnnotation = "");
 
+/// Collect a set of patterns to convert from the GPU dialect to LLVM.
+void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
+                                         OwningRewritePatternList &patterns,
+                                         StringRef gpuBinaryAnnotation);
+
 /// Creates a pass to convert kernel functions into GPU target object blobs.
 ///
 /// This transformation takes the body of each function that is annotated with
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -34,4 +34,5 @@
   MLIRLLVMIR
   MLIRPass
   MLIRSupport
+  MLIRStandardToLLVM
 )
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 
 #include "../PassDetail.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
@@ -34,21 +35,94 @@
 
 using namespace mlir;
 
-// To avoid name mangling, these are defined in the mini-runtime file.
-static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
-static constexpr const char *kGpuModuleGetFunctionName =
-    "mgpuModuleGetFunction";
-static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
-static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate";
-static constexpr const char *kGpuStreamSynchronizeName =
-    "mgpuStreamSynchronize";
-static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
 static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
 
 namespace {
 
-/// A pass to convert gpu.launch_func operations into a sequence of GPU
-/// runtime calls. Currently it supports CUDA and ROCm (HIP).
+class GpuLaunchFuncToGpuRuntimeCallsPass
+    : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
+          GpuLaunchFuncToGpuRuntimeCallsPass> {
+public:
+  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+    if (!gpuBinaryAnnotation.empty())
+      this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
+  }
+
+  // Run the dialect converter on the module.
+  void runOnOperation() override;
+};
+
+class FunctionCallBuilder {
+public:
+  FunctionCallBuilder(StringRef functionName, LLVM::LLVMType returnType,
+                      ArrayRef<LLVM::LLVMType> argumentTypes)
+      : functionName(functionName),
+        functionType(LLVM::LLVMType::getFunctionTy(returnType, argumentTypes,
+                                                   /*isVarArg=*/false)) {}
+  LLVM::CallOp create(Location loc, OpBuilder &builder,
+                      ArrayRef<Value> arguments) const;
+
+private:
+  StringRef functionName;
+  LLVM::LLVMType functionType;
+};
+
+template <typename OpTy>
+class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
+public:
+  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}
+
+protected:
+  LLVM::LLVMDialect *llvmDialect = this->typeConverter.getDialect();
+
+  LLVM::LLVMType llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect);
+  LLVM::LLVMType llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);
+  LLVM::LLVMType llvmPointerPointerType = llvmPointerType.getPointerTo();
+  LLVM::LLVMType llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);
+  LLVM::LLVMType llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);
+  LLVM::LLVMType llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);
+  LLVM::LLVMType llvmIntPtrType = LLVM::LLVMType::getIntNTy(
+      llvmDialect,
+      llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits());
+
+  FunctionCallBuilder moduleLoadCallBuilder = {
+      "mgpuModuleLoad",
+      llvmPointerType /* void *module */,
+      {llvmPointerType /* void *cubin */}};
+  FunctionCallBuilder moduleGetFunctionCallBuilder = {
+      "mgpuModuleGetFunction",
+      llvmPointerType /* void *function */,
+      {
+          llvmPointerType, /* void *module */
+          llvmPointerType  /* char *name   */
+      }};
+  FunctionCallBuilder launchKernelCallBuilder = {
+      "mgpuLaunchKernel",
+      llvmVoidType,
+      {
+          llvmPointerType,        /* void* f */
+          llvmIntPtrType,         /* intptr_t gridXDim */
+          llvmIntPtrType,         /* intptr_t gridyDim */
+          llvmIntPtrType,         /* intptr_t gridZDim */
+          llvmIntPtrType,         /* intptr_t blockXDim */
+          llvmIntPtrType,         /* intptr_t blockYDim */
+          llvmIntPtrType,         /* intptr_t blockZDim */
+          llvmInt32Type,          /* unsigned int sharedMemBytes */
+          llvmPointerType,        /* void *hstream */
+          llvmPointerPointerType, /* void **kernelParams */
+          llvmPointerPointerType  /* void **extra */
+      }};
+  FunctionCallBuilder streamCreateCallBuilder = {
+      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
+  FunctionCallBuilder streamSynchronizeCallBuilder = {
+      "mgpuStreamSynchronize",
+      llvmVoidType,
+      {llvmPointerType /* void *stream */}};
+};
+
+/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
+/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
 ///
 /// In essence, a gpu.launch_func operations gets compiled into the following
 /// sequence of runtime calls:
@@ -60,172 +134,65 @@
 /// * streamSynchronize -- waits for operations on the stream to finish
 ///
 /// Intermediate data structures are allocated on the stack.
-class GpuLaunchFuncToGpuRuntimeCallsPass
-    : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
-          GpuLaunchFuncToGpuRuntimeCallsPass> {
-private:
-  LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
-
-  llvm::LLVMContext &getLLVMContext() {
-    return getLLVMDialect()->getLLVMContext();
-  }
-
-  void initializeCachedTypes() {
-    const llvm::Module &module = llvmDialect->getLLVMModule();
-    llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect);
-    llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);
-    llvmPointerPointerType = llvmPointerType.getPointerTo();
-    llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);
-    llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);
-    llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);
-    llvmIntPtrType = LLVM::LLVMType::getIntNTy(
-        llvmDialect, module.getDataLayout().getPointerSizeInBits());
-  }
-
-  LLVM::LLVMType getVoidType() { return llvmVoidType; }
-
-  LLVM::LLVMType getPointerType() { return llvmPointerType; }
-
-  LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }
-
-  LLVM::LLVMType getInt8Type() { return llvmInt8Type; }
-
-  LLVM::LLVMType getInt32Type() { return llvmInt32Type; }
-
-  LLVM::LLVMType getInt64Type() { return llvmInt64Type; }
-
-  LLVM::LLVMType getIntPtrType() {
-    const llvm::Module &module = getLLVMDialect()->getLLVMModule();
-    return LLVM::LLVMType::getIntNTy(
-        getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
-  }
-
-  // Allocate a void pointer on the stack.
-  Value allocatePointer(OpBuilder &builder, Location loc) {
-    auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
-                                                builder.getI32IntegerAttr(1));
-    return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
-                                          /*alignment=*/0);
-  }
+class ConvertLaunchFuncOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
+public:
+  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
+                                             StringRef gpuBinaryAnnotation)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
+        gpuBinaryAnnotation(gpuBinaryAnnotation) {}
 
-  void declareGpuRuntimeFunctions(Location loc);
-  void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
-                      unsigned pos, Value one);
-  Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
+private:
+  void addParamToArray(OpBuilder &builder, Location loc, Value param,
+                       Value array, unsigned pos, Value one) const;
+  Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments,
+                            OpBuilder &builder) const;
   Value generateKernelNameConstant(StringRef moduleName, StringRef name,
-                                   Location loc, OpBuilder &builder);
-  void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
+                                   Location loc, OpBuilder &builder) const;
 
-public:
-  GpuLaunchFuncToGpuRuntimeCallsPass() = default;
-  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
-    this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
-  }
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
 
-  // Run the dialect converter on the module.
-  void runOnOperation() override {
-    // Cache the LLVMDialect for the current module.
-    llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
-    // Cache the used LLVM types.
-    initializeCachedTypes();
+  llvm::SmallString<32> gpuBinaryAnnotation;
+};
 
-    getOperation().walk(
-        [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
+class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
+  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
 
+  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
+                                PatternRewriter &rewriter) const override {
     // GPU kernel modules are no longer necessary since we have a global
     // constant with the CUBIN, or HSACO data.
-    for (auto m :
-         llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
-      m.erase();
+    rewriter.eraseOp(op);
+    return success();
   }
-
-private:
-  LLVM::LLVMDialect *llvmDialect;
-  LLVM::LLVMType llvmVoidType;
-  LLVM::LLVMType llvmPointerType;
-  LLVM::LLVMType llvmPointerPointerType;
-  LLVM::LLVMType llvmInt8Type;
-  LLVM::LLVMType llvmInt32Type;
-  LLVM::LLVMType llvmInt64Type;
-  LLVM::LLVMType llvmIntPtrType;
 };
 
-} // anonymous namespace
-
-// Adds declarations for the needed helper functions from the runtime wrappers.
-// The types in comments give the actual types expected/returned but the API
-// uses void pointers. This is fine as they have the same linkage in C.
-void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
-    Location loc) {
-  ModuleOp module = getOperation();
-  OpBuilder builder(module.getBody()->getTerminator());
-  if (!module.lookupSymbol(kGpuModuleLoadName)) {
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuModuleLoadName,
-        LLVM::LLVMType::getFunctionTy(getPointerType(),
-                                      {getPointerType()}, /* void *cubin */
-                                      /*isVarArg=*/false));
-  }
-  if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
-    // The helper uses void* instead of CUDA's opaque CUmodule and
-    // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuModuleGetFunctionName,
-        LLVM::LLVMType::getFunctionTy(getPointerType(),
-                                      {
-                                          getPointerType(), /* void *module */
-                                          getPointerType()  /* char *name   */
-                                      },
-                                      /*isVarArg=*/false));
-  }
-  if (!module.lookupSymbol(kGpuLaunchKernelName)) {
-    // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to
-    // match the LLVM type if MLIR's index type, which the GPU dialect uses.
-    // Furthermore, they use void* instead of CUDA's opaque CUfunction and
-    // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuLaunchKernelName,
-        LLVM::LLVMType::getFunctionTy(
-            getVoidType(),
-            {
-                getPointerType(),        /* void* f */
-                getIntPtrType(),         /* intptr_t gridXDim */
-                getIntPtrType(),         /* intptr_t gridyDim */
-                getIntPtrType(),         /* intptr_t gridZDim */
-                getIntPtrType(),         /* intptr_t blockXDim */
-                getIntPtrType(),         /* intptr_t blockYDim */
-                getIntPtrType(),         /* intptr_t blockZDim */
-                getInt32Type(),          /* unsigned int sharedMemBytes */
-                getPointerType(),        /* void *hstream */
-                getPointerPointerType(), /* void **kernelParams */
-                getPointerPointerType()  /* void **extra */
-            },
-            /*isVarArg=*/false));
-  }
-  if (!module.lookupSymbol(kGpuStreamCreateName)) {
-    // Helper function to get the current GPU compute stream. Uses void*
-    // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuStreamCreateName,
-        LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
-  }
-  if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuStreamSynchronizeName,
-        LLVM::LLVMType::getFunctionTy(getVoidType(),
-                                      {getPointerType()}, /* void *stream */
-                                      /*isVarArg=*/false));
-  }
-  if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
-    builder.create<LLVM::LLVMFuncOp>(
-        loc, kGpuMemHostRegisterName,
-        LLVM::LLVMType::getFunctionTy(getVoidType(),
-                                      {
-                                          getPointerType(), /* void *ptr */
-                                          getInt64Type()    /* int64 sizeBytes*/
-                                      },
-                                      /*isVarArg=*/false));
-  }
+} // namespace
+
+void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() {
+  LLVMTypeConverter converter(&getContext());
+  OwningRewritePatternList patterns;
+  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
+
+  LLVMConversionTarget target(getContext());
+  if (failed(applyPartialConversion(getOperation(), target, patterns)))
+    signalPassFailure();
+}
+
+LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
+                                         ArrayRef<Value> arguments) const {
+  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
+  auto function = [&] {
+    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
+      return function;
+    return OpBuilder(module.getBody()->getTerminator())
+        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
+  }();
+  return builder.create<LLVM::CallOp>(
+      loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),
+      builder.getSymbolRefAttr(function), arguments);
 }
 
 /// Emits the IR with the following structure:
@@ -233,27 +200,25 @@
 ///   %data = llvm.alloca 1 x type-of(<param>)
 ///   llvm.store <param>, %data
 ///   %typeErased = llvm.bitcast %data to !llvm<"i8*">
-///   %addr = llvm.getelementptr <list>[<pos>]
+///   %addr = llvm.getelementptr <array>[<pos>]
 ///   llvm.store %typeErased, %addr
 ///
-/// This is necessary to construct the list of arguments passed to the kernel
-/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
-/// of stack-allocated type-erased pointers to the actual arguments.
-void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
-                                                        Location loc,
-                                                        Value param, Value list,
-                                                        unsigned pos,
-                                                        Value one) {
+/// This is necessary to construct the array of arguments passed to the kernel
+/// function as accepted by cuLaunchKernel, i.e. as a void** that points to
+/// an array of stack-allocated type-erased pointers to the actual arguments.
+void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray(
+    OpBuilder &builder, Location loc, Value param, Value array, unsigned pos,
+    Value one) const {
   auto memLocation = builder.create<LLVM::AllocaOp>(
       loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
       /*alignment=*/1);
   builder.create<LLVM::StoreOp>(loc, param, memLocation);
   auto casted =
-      builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
+      builder.create<LLVM::BitcastOp>(loc, llvmPointerType, memLocation);
 
-  auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
+  auto index = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                 builder.getI32IntegerAttr(pos));
-  auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,
+  auto gep = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, array,
                                          ArrayRef<Value>{index});
   builder.create<LLVM::StoreOp>(loc, casted, gep);
 }
@@ -266,24 +231,16 @@
 // for (i : [0, NumKernelOperands))
 //   %array[i] = cast<void*>(KernelOperand[i])
 // return %array
-Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
-    gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
-
-  // Get the launch target.
-  auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
-      launchOp, launchOp.kernel());
-  if (!gpuFunc)
-    return {};
-
-  unsigned numArgs = gpuFunc.getNumArguments();
-
+Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
+    gpu::LaunchFuncOp launchOp, unsigned numArguments,
+    OpBuilder &builder) const {
   auto numKernelOperands = launchOp.getNumKernelOperands();
   Location loc = launchOp.getLoc();
-  auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
+  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(1));
   auto arraySize = builder.create<LLVM::ConstantOp>(
-      loc, getInt32Type(), builder.getI32IntegerAttr(numArgs));
-  auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),
+      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
+  auto array = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                               arraySize, /*alignment=*/0);
 
   unsigned pos = 0;
@@ -295,7 +252,7 @@
     // hold anymore then we `launchOp` to lower from MemRefType and not after
     // LLVMConversion has taken place and the MemRef information is lost.
     if (!llvmType.isStructTy()) {
-      addParamToList(builder, loc, operand, array, pos++, one);
+      addParamToArray(builder, loc, operand, array, pos++, one);
       continue;
     }
 
@@ -309,7 +266,7 @@
           Value elem = builder.create<LLVM::ExtractValueOp>(
               loc, elemType.getArrayElementType(), operand,
               builder.getI32ArrayAttr({j, k}));
-          addParamToList(builder, loc, elem, array, pos++, one);
+          addParamToArray(builder, loc, elem, array, pos++, one);
         }
       } else {
         assert((elemType.isIntegerTy() || elemType.isFloatTy() ||
@@ -317,7 +274,7 @@
                "expected scalar type");
         Value strct = builder.create<LLVM::ExtractValueOp>(
             loc, elemType, operand, builder.getI32ArrayAttr(j));
-        addParamToList(builder, loc, strct, array, pos++, one);
+        addParamToArray(builder, loc, strct, array, pos++, one);
       }
     }
   }
@@ -335,8 +292,9 @@
 //   %1 = llvm.constant (0 : index)
 //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
 // }
-Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
-    StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
+Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
+    StringRef moduleName, StringRef name, Location loc,
+    OpBuilder &builder) const {
   // Make sure the trailing zero is included in the constant.
   std::vector<char> kernelName(name.begin(), name.end());
   kernelName.push_back('\0');
@@ -357,93 +315,90 @@
 // %2 = <see generateKernelNameConstant>
 // %3 = call %moduleGetFunction(%1, %2)
 // %4 = call %streamCreate()
-// %5 = <see setupParamsArray>
+// %5 = <see generateParamsArray>
 // call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
 // call %streamSynchronize(%4)
-void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
-    mlir::gpu::LaunchFuncOp launchOp) {
-  OpBuilder builder(launchOp);
-  Location loc = launchOp.getLoc();
-  declareGpuRuntimeFunctions(loc);
+LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
+    Operation *op, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+  // TODO(csigg): do we need all these ArrayRefs<>{}?
+  Location loc = op->getLoc();
+  OpBuilder &builder = rewriter; // Ops must be created through the rewriter.
+  auto launchOp = cast<gpu::LaunchFuncOp>(op);
+  auto moduleOp = op->getParentOfType<ModuleOp>();
 
-  auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
-                                               builder.getI32IntegerAttr(0));
   // Create an LLVM global with CUBIN extracted from the kernel annotation and
   // obtain a pointer to the first byte in it.
-  auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(
-      launchOp.getKernelModuleName());
+  auto kernelModule =
+      moduleOp.lookupSymbol<gpu::GPUModuleOp>(launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");
 
   auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
   if (!binaryAttr) {
     kernelModule.emitOpError()
         << "missing " << gpuBinaryAnnotation << " attribute";
-    return signalPassFailure();
+    return failure();
   }
 
   SmallString<128> nameBuffer(kernelModule.getName());
   nameBuffer.append(kGpuBinaryStorageSuffix);
-  Value data = LLVM::createGlobalString(
-      loc, builder, nameBuffer.str(), binaryAttr.getValue(),
-      LLVM::Linkage::Internal, getLLVMDialect());
-
-  // Emit the load module call to load the module data. Error checking is done
-  // in the called helper function.
-  auto gpuModuleLoad =
-      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
-  auto module = builder.create<LLVM::CallOp>(
-      loc, ArrayRef<Type>{getPointerType()},
-      builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef<Value>{data});
+  Value data = LLVM::createGlobalString(loc, builder, nameBuffer.str(),
+                                        binaryAttr.getValue(),
+                                        LLVM::Linkage::Internal, llvmDialect);
+
+  auto module =
+      moduleLoadCallBuilder.create(loc, builder, ArrayRef<Value>{data});
   // Get the function from the module. The name corresponds to the name of
   // the kernel function.
   auto kernelName = generateKernelNameConstant(
       launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
-  auto gpuModuleGetFunction =
-      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
-  auto function = builder.create<LLVM::CallOp>(
-      loc, ArrayRef<Type>{getPointerType()},
-      builder.getSymbolRefAttr(gpuModuleGetFunction),
-      ArrayRef<Value>{module.getResult(0), kernelName});
+  auto function = moduleGetFunctionCallBuilder.create(
+      loc, builder, ArrayRef<Value>{module.getResult(0), kernelName});
   // Grab the global stream needed for execution.
-  auto gpuStreamCreate =
-      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamCreateName);
-  auto stream = builder.create<LLVM::CallOp>(
-      loc, ArrayRef<Type>{getPointerType()},
-      builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef<Value>{});
-  // Invoke the function with required arguments.
-  auto gpuLaunchKernel =
-      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
-  auto paramsArray = setupParamsArray(launchOp, builder);
-  if (!paramsArray) {
-    launchOp.emitOpError() << "cannot pass given parameters to the kernel";
-    return signalPassFailure();
+  auto stream = streamCreateCallBuilder.create(loc, builder, ArrayRef<Value>{});
+
+  // Get the launch target.
+  auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
+      launchOp, launchOp.kernel());
+  if (!gpuFuncOp) {
+    launchOp.emitOpError() << "corresponding kernel function not found";
+    return failure();
   }
+  // Build array of kernel parameters.
+  auto kernelParams =
+      generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), builder);
+
+  // Invoke the function with required arguments.
+  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+                                               builder.getI32IntegerAttr(0));
   auto nullpointer =
-      builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
-  builder.create<LLVM::CallOp>(
-      loc, ArrayRef<Type>{getVoidType()},
-      builder.getSymbolRefAttr(gpuLaunchKernel),
-      ArrayRef<Value>{function.getResult(0), launchOp.getOperand(0),
-                      launchOp.getOperand(1), launchOp.getOperand(2),
-                      launchOp.getOperand(3), launchOp.getOperand(4),
-                      launchOp.getOperand(5), zero, /* sharedMemBytes */
-                      stream.getResult(0),          /* stream */
-                      paramsArray,                  /* kernel params */
+      builder.create<LLVM::IntToPtrOp>(loc, llvmPointerPointerType, zero);
+  launchKernelCallBuilder.create(
+      loc, builder,
+      ArrayRef<Value>{function.getResult(0), launchOp.gridSizeX(),
+                      launchOp.gridSizeY(), launchOp.gridSizeZ(),
+                      launchOp.blockSizeX(), launchOp.blockSizeY(),
+                      launchOp.blockSizeZ(), zero, /* sharedMemBytes */
+                      stream.getResult(0),         /* stream */
+                      kernelParams,                /* kernel params */
                       nullpointer /* extra */});
-  // Sync on the stream to make it synchronous.
-  auto gpuStreamSync =
-      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
-  builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getVoidType()},
-                               builder.getSymbolRefAttr(gpuStreamSync),
-                               ArrayRef<Value>(stream.getResult(0)));
-  launchOp.erase();
+  streamSynchronizeCallBuilder.create(loc, builder, stream.getResult(0));
+
+  rewriter.eraseOp(op);
+  return success();
 }
 
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
     StringRef gpuBinaryAnnotation) {
-  if (gpuBinaryAnnotation.empty())
-    return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
   return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
       gpuBinaryAnnotation);
 }
+
+void mlir::populateGpuToLLVMConversionPatterns(
+    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
+    StringRef gpuBinaryAnnotation) {
+  patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
+      converter, gpuBinaryAnnotation);
+  patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
+}
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -110,15 +110,17 @@
   PassManager pm(m.getContext());
   applyPassManagerCLOptions(pm);
 
+  const char gpuBinaryAnnotation[] = "nvvm.cubin";
   pm.addPass(createGpuKernelOutliningPass());
   auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
   kernelPm.addPass(createStripDebugInfoPass());
   kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
   kernelPm.addPass(createConvertGPUKernelToBlobPass(
       translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
-      "sm_35", "+ptx60", "nvvm.cubin"));
+      "sm_35", "+ptx60", gpuBinaryAnnotation));
   pm.addPass(createLowerToLLVMPass());
-  pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());
+  pm.addPass(
+      createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
 
   return pm.run(m);
 }
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
--- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
+++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
@@ -299,16 +299,17 @@
   // Configure target features per ROCm / HIP version.
   configTargetFeatures();
 
+  const char gpuBinaryAnnotation[] = "rocdl.hsaco";
   pm.addPass(createGpuKernelOutliningPass());
   auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
   kernelPm.addPass(createStripDebugInfoPass());
   kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
   kernelPm.addPass(createConvertGPUKernelToBlobPass(
       compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
-      features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+      features, gpuBinaryAnnotation));
   pm.addPass(createLowerToLLVMPass());
-  pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
-      /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+  pm.addPass(
+      createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));
 
   return pm.run(m);
 }