diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -25,8 +25,15 @@
 namespace mlir {
 /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
 /// a separate kernel function.
-std::unique_ptr<OperationPass<ModuleOp>>
-createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
+///
+/// Before outlining, ops defined outside a `gpu.launch` may be sunk (cloned)
+/// into its region. `isSinkingBeneficiary`, if non-null, selects which ops
+/// are candidates for sinking; if null, a built-in default predicate is
+/// used. Sunk ops are duplicated, so the predicate must only accept ops
+/// without side effects.
+std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass(
+    StringRef dataLayoutStr = StringRef(),
+    std::function<bool(Operation *)> isSinkingBeneficiary = nullptr);
 
 /// Rewrites a function region so that GPU ops execute asynchronously.
 std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
diff --git a/mlir/include/mlir/Dialect/GPU/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils.h
--- a/mlir/include/mlir/Dialect/GPU/Utils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils.h
@@ -38,7 +38,13 @@
 /// Sink operations into the `launchOp` to reduce the number of values that are
 /// used within the region of the operation, but defined outside of the
 /// region.
-LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp);
+///
+/// `isSinkingBeneficiary` is the predicate used to decide whether an operation
+/// is a candidate for sinking; when null, a built-in default predicate is
+/// used. Sunk operations are cloned, so it must only accept ops without
+/// side effects.
+LogicalResult sinkOperationsIntoLaunchOp(
+    gpu::LaunchOp launchOp,
+    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary = nullptr);
 
 } // namespace mlir
 #endif // MLIR_DIALECT_GPU_UTILS_H_
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -59,7 +59,7 @@
 /// Identifies operations that are beneficial to sink into kernels. These
 /// operations may not have side-effects, as otherwise sinking (and hence
 /// duplicating them) is not legal.
-static bool isSinkingBeneficiary(Operation *op) {
+static bool isSinkingBeneficiaryDefault(Operation *op) {
   return isa<arith::ConstantOp, ConstantOp, memref::DimOp, arith::SelectOp,
              arith::CmpIOp>(op);
 }
@@ -75,11 +75,11 @@
 /// the order they should appear in the kernel. Furthermore, `availableValues`
 /// is updated with results that will be available after sinking the identified
 /// ops.
-static bool
-extractBeneficiaryOps(Operation *op,
-                      const SetVector<Value> &existingDependencies,
-                      SetVector<Operation *> &beneficiaryOps,
-                      llvm::SmallPtrSetImpl<Value> &availableValues) {
+static bool extractBeneficiaryOps(
+    Operation *op, const SetVector<Value> &existingDependencies,
+    SetVector<Operation *> &beneficiaryOps,
+    llvm::SmallPtrSetImpl<Value> &availableValues,
+    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
   if (beneficiaryOps.count(op))
     return true;
 
@@ -93,9 +93,9 @@
     // Else check whether it can be made available via sinking or already is a
     // dependency.
     Operation *definingOp = operand.getDefiningOp();
-    if ((!definingOp ||
-         !extractBeneficiaryOps(definingOp, existingDependencies,
-                                beneficiaryOps, availableValues)) &&
+    if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
+                                               beneficiaryOps, availableValues,
+                                               isSinkingBeneficiary)) &&
         !existingDependencies.count(operand))
       return false;
   }
@@ -106,7 +106,9 @@
   return true;
 }
 
-LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
+LogicalResult mlir::sinkOperationsIntoLaunchOp(
+    gpu::LaunchOp launchOp,
+    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
   Region &launchOpBody = launchOp.body();
 
   // Identify uses from values defined outside of the scope of the launch
@@ -120,7 +122,9 @@
     Operation *operandOp = operand.getDefiningOp();
     if (!operandOp)
       continue;
-    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues);
+    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
+                          isSinkingBeneficiary ? isSinkingBeneficiary
+                                               : isSinkingBeneficiaryDefault);
   }
 
   // Insert operations so that the defs get cloned before uses.
@@ -240,13 +244,16 @@
 class GpuKernelOutliningPass
     : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
 public:
-  GpuKernelOutliningPass(StringRef dlStr) {
+  GpuKernelOutliningPass(StringRef dlStr,
+                         std::function<bool(Operation *)> isSinkingBeneficiary_)
+      : isSinkingBeneficiary(std::move(isSinkingBeneficiary_)) {
     if (!dlStr.empty() && !dataLayoutStr.hasValue())
       dataLayoutStr = dlStr.str();
   }
 
   GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
-      : dataLayoutSpec(other.dataLayoutSpec) {
+      : dataLayoutSpec(other.dataLayoutSpec),
+        isSinkingBeneficiary(other.isSinkingBeneficiary) {
     dataLayoutStr = other.dataLayoutStr;
   }
 
@@ -277,8 +284,13 @@
             Twine(op->getParentOfType<FuncOp>().getName(), "_kernel").str();
 
         // Pull in instructions that can be sunk
-        if (failed(sinkOperationsIntoLaunchOp(op)))
-          return WalkResult::interrupt();
+        // Forward the predicate only when set; otherwise use the default.
+        LogicalResult sinkResult =
+            isSinkingBeneficiary
+                ? sinkOperationsIntoLaunchOp(op, isSinkingBeneficiary)
+                : sinkOperationsIntoLaunchOp(op);
+        if (failed(sinkResult))
+          return WalkResult::interrupt();
         gpu::GPUFuncOp outlinedFunc =
             outlineKernelFuncImpl(op, kernelFnName, operands);
 
@@ -352,11 +364,16 @@
                      "attached to the GPU kernel module")};
 
   DataLayoutSpecInterface dataLayoutSpec;
+  /// Optional predicate selecting ops to sink into `gpu.launch` regions;
+  /// when empty, the default predicate is used.
+  std::function<bool(Operation *)> isSinkingBeneficiary;
 };
 
 } // namespace
 
-std::unique_ptr<OperationPass<ModuleOp>>
-mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
-  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
+std::unique_ptr<OperationPass<ModuleOp>> mlir::createGpuKernelOutliningPass(
+    StringRef dataLayoutStr,
+    std::function<bool(Operation *)> isSinkingBeneficiary) {
+  return std::make_unique<GpuKernelOutliningPass>(
+      dataLayoutStr, std::move(isSinkingBeneficiary));
 }