diff --git a/mlir/include/mlir/Dialect/GPU/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils.h
@@ -0,0 +1,44 @@
+//===- Utils.h - GPU dialect utilities --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines utility functions exposed by the GPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_H_
+
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+struct LogicalResult;
+class Value;
+
+namespace gpu {
+class GPUFuncOp;
+class LaunchOp;
+} // namespace gpu
+
+/// Get a gpu.func created from outlining the region of a gpu.launch op with
+/// the given `kernelFnName`. The region of the `launchOp` can use values
+/// defined above it; such values are captured and passed as arguments to the
+/// generated gpu.func. The generated function has arguments
+/// - corresponding to the values passed in as `operands`, in that order.
+/// - any additional values that are used within the region of the `launchOp`
+///   and defined above it. These captured values are appended to the
+///   `operands` list.
+gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName,
+                                 SmallVectorImpl<Value> &operands);
+
+/// Sink operations into the `launchOp` to reduce the number of values that
+/// are used within the region of the operation but defined outside of the
+/// region.
+LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp);
+
+} // namespace mlir
+#endif // MLIR_DIALECT_GPU_UTILS_H_
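
For illustration, a minimal sketch of how a client could compose the two
utilities declared above; the wrapper `outlineOneLaunch` and its kernel-name
argument are assumptions for this sketch, not part of the patch:

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Sink cheap defining ops into the launch region, then outline the region
// into a gpu.func; `operands` receives the kernel arguments, including any
// values captured from above the launch.
static LogicalResult outlineOneLaunch(gpu::LaunchOp launchOp,
                                      StringRef kernelFnName) {
  if (failed(sinkOperationsIntoLaunchOp(launchOp)))
    return failure();
  SmallVector<Value, 4> operands;
  gpu::GPUFuncOp kernelFunc =
      outlineKernelFunc(launchOp, kernelFnName, operands);
  // The caller is still responsible for inserting `kernelFunc` into a symbol
  // table and for replacing the launch, e.g. with a gpu.launch_func.
  (void)kernelFunc;
  return success();
}
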
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
@@ -32,12 +33,15 @@
 }
 
 // Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `body` region and replace uses of the respective function
-// arguments.
-static void injectGpuIndexOperations(Location loc, Region &body) {
+// beginning of the `launchFuncOpBody` region. Add a mapping from the arguments
+// of the entry block of `launchOpBody` to the corresponding result values of
+// the added operations.
+static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
+                                     Region &launchOpBody,
+                                     BlockAndValueMapping &map) {
   OpBuilder builder(loc->getContext());
-  Block &firstBlock = body.front();
-  builder.setInsertionPointToStart(&firstBlock);
+  Block &firstBlock = launchOpBody.front();
+  builder.setInsertionPointToStart(&launchFuncOpBody.front());
   SmallVector<Value, 12> indexOps;
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
@@ -45,73 +49,89 @@
   createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
-  // Replace the leading 12 function args with the respective thread/block index
-  // operations. Iterate backwards since args are erased and indices change.
-  for (int i = 11; i >= 0; --i) {
-    firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]);
-    firstBlock.eraseArgument(i);
-  }
+  // Map the leading 12 entry block arguments to the results of the respective
+  // thread/block index operations created above.
+  for (auto indexOp : enumerate(indexOps))
+    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
 }
 
-static bool isInliningBeneficiary(Operation *op) {
+static bool isSinkingBeneficiary(Operation *op) {
   return isa<ConstantOp>(op) || isa<DimOp>(op);
 }
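
The predicate above is deliberately narrow: constants and dims are free to
recompute inside the kernel. Purely as a hypothetical illustration (assuming
the same includes as this file), a target that also treats view-like ops as
free could widen it; the name and the `MemRefCastOp` choice are assumptions,
not part of the patch:

// Hypothetical variant of the predicate: additionally sinks memref casts,
// which carry no data and are cheap to rematerialize inside the kernel.
static bool isSinkingBeneficiaryWithCasts(Operation *op) {
  return isa<ConstantOp>(op) || isa<DimOp>(op) || isa<MemRefCastOp>(op);
}
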
 
-// Move arguments of the given kernel function into the function if this reduces
-// the number of kernel arguments.
-static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
-                                              gpu::LaunchFuncOp launch) {
-  OpBuilder kernelBuilder(kernelFunc.getBody());
-  auto &firstBlock = kernelFunc.getBody().front();
-  SmallVector<Value, 8> newLaunchArgs;
-  BlockAndValueMapping map;
-  for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) {
-    map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i));
-  }
-  for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) {
-    auto operandOp = launch.getKernelOperand(i).getDefiningOp();
-    if (!operandOp || !isInliningBeneficiary(operandOp)) {
-      newLaunchArgs.push_back(launch.getKernelOperand(i));
+LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
+  Region &launchOpBody = launchOp.body();
+
+  // Identify uses from values defined outside of the scope of the launch
+  // operation.
+  llvm::SetVector<Value> sinkCandidates;
+  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
+
+  llvm::SetVector<Value> sunkValues;
+  llvm::SetVector<Operation *> sunkOperations;
+  for (Value operand : sinkCandidates) {
+    Operation *operandOp = operand.getDefiningOp();
+    if (!operandOp || !isSinkingBeneficiary(operandOp))
       continue;
-    }
-    // Only inline operations that do not create new arguments.
-    if (!llvm::all_of(operandOp->getOperands(),
-                      [map](Value value) { return map.contains(value); })) {
+    // Only sink operations that do not create new sinkCandidates.
+    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
+          return sinkCandidates.count(value);
+        }))
       continue;
-    }
-    auto clone = kernelBuilder.clone(*operandOp, map);
-    firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0));
-    firstBlock.eraseArgument(i);
+    sunkValues.insert(operand);
+    sunkOperations.insert(operandOp);
   }
-  if (newLaunchArgs.size() == launch.getNumKernelOperands())
-    return launch;
-
-  std::reverse(newLaunchArgs.begin(), newLaunchArgs.end());
-  OpBuilder LaunchBuilder(launch);
-  SmallVector<Type, 8> newArgumentTypes;
-  newArgumentTypes.reserve(firstBlock.getNumArguments());
-  for (auto value : firstBlock.getArguments()) {
-    newArgumentTypes.push_back(value.getType());
+
+  // Insert operations so that the defs get cloned before uses.
+  BlockAndValueMapping map;
+  OpBuilder builder(launchOpBody);
+  DenseSet<Operation *> processed;
+  SmallVector<Operation *, 2> clonedOps;
+  while (processed.size() != sunkOperations.size()) {
+    auto startSize = processed.size();
+    for (Operation *sunkOperation : sunkOperations) {
+      if (processed.count(sunkOperation))
+        continue;
+
+      // An operation can't be cloned yet if any of its operands is also being
+      // sunk, but isn't cloned yet.
+      if (llvm::any_of(
+              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
+                return sunkValues.count(value) && !map.lookupOrNull(value);
+              }))
+        continue;
+
+      Operation *clonedOp = builder.clone(*sunkOperation, map);
+      // Only replace uses within the launch op.
+      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
+        auto replacement = clonedOp->getResult(result.index());
+        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
+          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
+            use.set(replacement);
+      }
+      processed.insert(sunkOperation);
+    }
+    if (startSize == processed.size())
+      return launchOp.emitError(
+          "found illegal cyclic dependency between operations while sinking");
   }
-  kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {}));
-  auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>(
-      launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(),
-      launch.getBlockSizeOperandValues(), newLaunchArgs);
-  launch.erase();
-  return newLaunch;
+  return success();
 }
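
A minimal sketch of invoking the sinking utility on its own over a whole
module; the helper name `sinkAllLaunchOps` is an assumption for illustration,
not part of the patch:

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/IR/Module.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Sink beneficiary ops into every gpu.launch in `module`; the only failure
// mode is the cyclic-dependency diagnostic emitted above.
static LogicalResult sinkAllLaunchOps(ModuleOp module) {
  auto walkResult = module.walk([](gpu::LaunchOp launch) {
    return failed(sinkOperationsIntoLaunchOp(launch))
               ? WalkResult::interrupt()
               : WalkResult::advance();
  });
  return failure(walkResult.wasInterrupted());
}
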
 
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.terminator` operations by `gpu.return` in the generated function.
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
-                                        llvm::SetVector<Value> &operands) {
+static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
+                                            StringRef kernelFnName,
+                                            llvm::SetVector<Value> &operands) {
   Location loc = launchOp.getLoc();
   // Create a builder with no insertion point, insertion will happen separately
   // due to symbol table manipulation.
   OpBuilder builder(launchOp.getContext());
+  Region &launchOpBody = launchOp.body();
 
   // Identify uses from values defined outside of the scope of the launch
   // operation.
-  getUsedValuesDefinedAbove(launchOp.body(), operands);
+  getUsedValuesDefinedAbove(launchOpBody, operands);
 
+  // Create the gpu.func operation.
   SmallVector<Type, 4> kernelOperandTypes;
   kernelOperandTypes.reserve(operands.size());
   for (Value operand : operands) {
@@ -119,38 +139,68 @@
   }
   FunctionType type =
       FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
-  std::string kernelFuncName =
-      Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
   outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
-  outlinedFunc.body().takeBody(launchOp.body());
-  injectGpuIndexOperations(loc, outlinedFunc.body());
-  Block &entryBlock = outlinedFunc.body().front();
-  for (Value operand : operands) {
-    BlockArgument newArg = entryBlock.addArgument(operand.getType());
-    replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body());
-  }
+  BlockAndValueMapping map;
+
+  // Map the arguments corresponding to the launch parameters like blockIdx,
+  // threadIdx, etc.
+  Region &outlinedFuncBody = outlinedFunc.body();
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+
+  // Map arguments from the gpu.launch region to the arguments of the gpu.func
+  // operation.
+  Block &entryBlock = outlinedFuncBody.front();
+  for (auto operand : enumerate(operands))
+    map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+  // Clone the region of the gpu.launch operation into the gpu.func operation.
+  // TODO(ravishankarm): If cloneInto can be modified such that if a mapping for
+  // a block exists, that block will be used to clone operations into (at the
+  // end of the block), instead of creating a new block, this would be much
+  // cleaner.
+  launchOpBody.cloneInto(&outlinedFuncBody, map);
+
+  // Branch from the entry of the gpu.func operation to the block that is
+  // cloned from the entry block of the gpu.launch operation.
+  Block &launchOpEntry = launchOpBody.front();
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
+  builder.setInsertionPointToEnd(&entryBlock);
+  builder.create<BranchOp>(loc, clonedLaunchOpEntry);
+
   outlinedFunc.walk([](gpu::TerminatorOp op) {
     OpBuilder replacer(op);
     replacer.create<gpu::ReturnOp>(op.getLoc());
     op.erase();
   });
-
   return outlinedFunc;
 }
 
+gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
+                                       StringRef kernelFnName,
+                                       llvm::SmallVectorImpl<Value> &operands) {
+  DenseSet<Value> inputOperandSet;
+  inputOperandSet.insert(operands.begin(), operands.end());
+  llvm::SetVector<Value> operandSet(operands.begin(), operands.end());
+  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
+  for (auto operand : operandSet) {
+    if (!inputOperandSet.count(operand))
+      operands.push_back(operand);
+  }
+  return funcOp;
+}
+
 // Replace `gpu.launch` operations with a `gpu.launch_func` operation launching
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
-static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp,
+static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                   gpu::GPUFuncOp kernelFunc,
                                   ValueRange operands) {
   OpBuilder builder(launchOp);
-  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
+  builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), operands);
-  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
   launchOp.erase();
 }
 
@@ -173,9 +223,16 @@
     for (auto func : getModule().getOps<FuncOp>()) {
       // Insert just after the function.
       Block::iterator insertPt(func.getOperation()->getNextNode());
-      func.walk([&](gpu::LaunchOp op) {
+      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        llvm::SetVector<Value> operands;
-        gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands);
+        std::string kernelFnName =
+            Twine(op.getParentOfType<FuncOp>().getName(), "_kernel").str();
+
+        // Pull in operations that can be sunk.
+        if (failed(sinkOperationsIntoLaunchOp(op)))
+          return WalkResult::interrupt();
+        gpu::GPUFuncOp outlinedFunc =
+            outlineKernelFuncImpl(op, kernelFnName, operands);
 
         // Create nested module and insert outlinedFunc. The module will
         // originally get the same name as the function, but may be renamed on
@@ -186,7 +243,10 @@
         // Potentially changes signature, pulling in constants.
         convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
         modified = true;
+        return WalkResult::advance();
       });
+      if (funcWalkResult.wasInterrupted())
+        return signalPassFailure();
     }
 
     // If any new module was inserted in this module, annotate this module as
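
The tests below exercise this pass through `mlir-opt -gpu-kernel-outlining`.
A sketch of scheduling it programmatically instead; the harness around it
(context and module setup) is assumed:

#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Run the kernel-outlining module pass, which now performs sinking followed
// by outlining, over `module`.
static LogicalResult runKernelOutlining(ModuleOp module, MLIRContext *context) {
  PassManager pm(context);
  pm.addPass(createGpuKernelOutliningPass());
  return pm.run(module);
}
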
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -51,6 +51,8 @@
 // CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index
+// CHECK-NEXT: br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
 // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
 // CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
@@ -108,6 +110,28 @@
 
 // -----
 
+func @multiple_uses(%arg0 : memref<?xf32>) {
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  // CHECK: gpu.func {{.*}} {
+  // CHECK:   %[[C2:.*]] = constant 2 : index
+  // CHECK:   "use1"(%[[C2]], %[[C2]])
+  // CHECK:   "use2"(%[[C2]])
+  // CHECK:   gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+                                        %block_z = %c1) {
+    "use1"(%c2, %c2) : (index, index) -> ()
+    "use2"(%c2) : (index) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// -----
+
 llvm.mlir.global internal @global(42 : i64) : !llvm.i64
 
 func @function_call(%arg0 : memref<?xf32>) {