diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_GPU_PASSES_H_
 #define MLIR_DIALECT_GPU_PASSES_H_
 
+#include "mlir/Support/LLVM.h"
 #include <memory>
 
 namespace mlir {
@@ -21,6 +22,26 @@
 class ModuleOp;
 template <typename T> class OpPassBase;
 class OwningRewritePatternList;
+class Value;
+
+namespace gpu {
+class GPUFuncOp;
+class LaunchOp;
+} // namespace gpu
+
+/// Get a gpu.func created from outlining the region of a gpu.launch op with the
+/// given `kernelFnName`. The region of the `launchOp` can use values from
+/// above. These need to be captured and passed as arguments to the generated
+/// gpu.func. The generated function has arguments
+/// - corresponding to the values passed in as `operands`, in that order.
+/// - any additional values that might be used within the region of the
+///   `launchOp` and defined above it. These captured values are appended to the
+///   `operands` list.
+/// A best effort is done to "sink" as many of the operations from "above" the
+/// `launchOp` as possible to reduce the number of values that are appended to
+/// `operands`.
+gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName,
+                                 SmallVectorImpl<Value> &operands);
 
 std::unique_ptr<OpPassBase<ModuleOp>> createGpuKernelOutliningPass();
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -32,12 +32,15 @@
 }
 
 // Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `body` region and replace uses of the respective function
-// arguments.
-static void injectGpuIndexOperations(Location loc, Region &body) {
+// beginning of the `launchFuncOpBody` region. Add mapping from argument in
+// entry block of `launchOpBody`, to the corresponding result value of the added
+// operations.
+static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
+                                     Region &launchOpBody,
+                                     BlockAndValueMapping &map) {
   OpBuilder builder(loc->getContext());
-  Block &firstBlock = body.front();
-  builder.setInsertionPointToStart(&firstBlock);
+  Block &firstBlock = launchOpBody.front();
+  builder.setInsertionPointToStart(&launchFuncOpBody.front());
   SmallVector<Value, 12> indexOps;
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
@@ -45,73 +48,48 @@
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
   // Replace the leading 12 function args with the respective thread/block index
   // operations. Iterate backwards since args are erased and indices change.
-  for (int i = 11; i >= 0; --i) {
-    firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]);
-    firstBlock.eraseArgument(i);
-  }
+  for (auto indexOp : enumerate(indexOps))
+    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
 }
 
-static bool isInliningBeneficiary(Operation *op) {
+static bool isSinkingBeneficiary(Operation *op) {
   return isa<ConstantOp>(op) || isa<DimOp>(op);
 }
 
-// Move arguments of the given kernel function into the function if this reduces
-// the number of kernel arguments.
-static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
-                                              gpu::LaunchFuncOp launch) {
-  OpBuilder kernelBuilder(kernelFunc.getBody());
-  auto &firstBlock = kernelFunc.getBody().front();
-  SmallVector<Value, 8> newLaunchArgs;
-  BlockAndValueMapping map;
-  for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) {
-    map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i));
-  }
-  for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) {
-    auto operandOp = launch.getKernelOperand(i).getDefiningOp();
-    if (!operandOp || !isInliningBeneficiary(operandOp)) {
-      newLaunchArgs.push_back(launch.getKernelOperand(i));
-      continue;
-    }
-    // Only inline operations that do not create new arguments.
-    if (!llvm::all_of(operandOp->getOperands(),
-                      [map](Value value) { return map.contains(value); })) {
-      continue;
-    }
-    auto clone = kernelBuilder.clone(*operandOp, map);
-    firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0));
-    firstBlock.eraseArgument(i);
-  }
-  if (newLaunchArgs.size() == launch.getNumKernelOperands())
-    return launch;
-
-  std::reverse(newLaunchArgs.begin(), newLaunchArgs.end());
-  OpBuilder LaunchBuilder(launch);
-  SmallVector<Type, 8> newArgumentTypes;
-  newArgumentTypes.reserve(firstBlock.getNumArguments());
-  for (auto value : firstBlock.getArguments()) {
-    newArgumentTypes.push_back(value.getType());
-  }
-  kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {}));
-  auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>(
-      launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(),
-      launch.getBlockSizeOperandValues(), newLaunchArgs);
-  launch.erase();
-  return newLaunch;
-}
-
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.terminator` operations by `gpu.return` in the generated function.
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
-                                        llvm::SetVector<Value> &operands) {
+static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
+                                            StringRef kernelFnName,
+                                            llvm::SetVector<Value> &operands) {
   Location loc = launchOp.getLoc();
   // Create a builder with no insertion point, insertion will happen separately
   // due to symbol table manipulation.
   OpBuilder builder(launchOp.getContext());
+  Region &launchOpBody = launchOp.body();
 
   // Identify uses from values defined outside of the scope of the launch
   // operation.
-  getUsedValuesDefinedAbove(launchOp.body(), operands);
+  getUsedValuesDefinedAbove(launchOpBody, operands);
+
+  // Prune those that will be sunk into the gpu.func operation.
+  llvm::SetVector<Value> elidedOperands;
+  llvm::SetVector<Operation *> sunkOperations;
+  for (Value operand : operands) {
+    Operation *operandOp = operand.getDefiningOp();
+    if (!operandOp || !isSinkingBeneficiary(operandOp))
+      continue;
+    // Only sink operations that do not create new arguments.
+    if (!llvm::all_of(operandOp->getOperands(), [&operands,
+                                                 &elidedOperands](Value value) {
+          return operands.count(value) && !elidedOperands.count(value);
+        }))
+      continue;
+    elidedOperands.insert(operand);
+    sunkOperations.insert(operandOp);
+  }
+  operands.set_subtract(elidedOperands);
+  // Create the gpu.func operation.
   SmallVector<Type, 4> kernelOperandTypes;
   kernelOperandTypes.reserve(operands.size());
   for (Value operand : operands) {
@@ -119,27 +97,61 @@
   }
   FunctionType type =
       FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
-  std::string kernelFuncName =
-      Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
   outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
-  outlinedFunc.body().takeBody(launchOp.body());
-  injectGpuIndexOperations(loc, outlinedFunc.body());
-  Block &entryBlock = outlinedFunc.body().front();
-  for (Value operand : operands) {
-    BlockArgument newArg = entryBlock.addArgument(operand.getType());
-    replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body());
-  }
+  BlockAndValueMapping map;
+
+  // Map the arguments corresponding to the launch parameters like blockIdx,
+  // threadIdx, etc.
+  Region &outlinedFuncBody = outlinedFunc.body();
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+
+  // Map arguments from gpu.launch region to the arguments of the gpu.func
+  // operation.
+  Block &entryBlock = outlinedFuncBody.front();
+  for (auto operand : enumerate(operands))
+    map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+  // Clone the sunk operations into the gpu.func operation.
+  for (auto *sunkOp : sunkOperations)
+    entryBlock.push_back(sunkOp->clone(map));
+
+  // Clone the region of the gpu.launch operation into the gpu.func operation.
+  // TODO(ravishankarm): If cloneInto can be modified such that if a mapping for
+  // a block exists, that block will be used to clone operations into (at the
+  // end of the block), instead of creating a new block, this would be much
+  // cleaner.
+  launchOpBody.cloneInto(&outlinedFuncBody, map);
+
+  // Branch from entry of the gpu.func operation to the block that is cloned from
+  // the entry block of the gpu.launch operation.
+  Block &launchOpEntry = launchOpBody.front();
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
+  builder.setInsertionPointToEnd(&entryBlock);
+  builder.create<BranchOp>(loc, clonedLaunchOpEntry);
+
   outlinedFunc.walk([](gpu::TerminatorOp op) {
     OpBuilder replacer(op);
     replacer.create<gpu::ReturnOp>(op.getLoc());
     op.erase();
   });
-
   return outlinedFunc;
 }
 
+gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
+                                       StringRef kernelFnName,
+                                       llvm::SmallVectorImpl<Value> &operands) {
+  llvm::SetVector<Value> inputOperandSet(operands.begin(), operands.end());
+  llvm::SetVector<Value> operandSet(operands.begin(), operands.end());
+  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
+  for (auto operand : operandSet) {
+    if (!inputOperandSet.count(operand))
+      operands.push_back(operand);
+  }
+  return funcOp;
+}
+
 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
@@ -147,10 +159,9 @@
                                   gpu::GPUFuncOp kernelFunc,
                                   ValueRange operands) {
   OpBuilder builder(launchOp);
-  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
+  builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), operands);
-  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
   launchOp.erase();
 }
 
@@ -175,7 +186,10 @@
     Block::iterator insertPt(func.getOperation()->getNextNode());
     func.walk([&](gpu::LaunchOp op) {
       llvm::SetVector<Value> operands;
-      gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands);
+      std::string kernelFnName =
+          Twine(op.getParentOfType<FuncOp>().getName(), "_kernel").str();
+      gpu::GPUFuncOp outlinedFunc =
+          outlineKernelFuncImpl(op, kernelFnName, operands);
 
       // Create nested module and insert outlinedFunc. The module will
       // originally get the same name as the function, but may be renamed on