diff --git a/mlir/include/mlir/Dialect/Async/Passes.h b/mlir/include/mlir/Dialect/Async/Passes.h
--- a/mlir/include/mlir/Dialect/Async/Passes.h
+++ b/mlir/include/mlir/Dialect/Async/Passes.h
@@ -19,9 +19,9 @@
 
 std::unique_ptr<Pass> createAsyncParallelForPass();
 
-std::unique_ptr<Pass> createAsyncParallelForPass(bool asyncDispatch,
-                                                 int32_t numWorkerThreads,
-                                                 int32_t targetBlockSize);
+std::unique_ptr<Pass>
+createAsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads,
+                           bool useCostModel, int32_t minTaskSize = 512 * 1024);
 
 std::unique_ptr<OperationPass<ModuleOp>> createAsyncToAsyncRuntimePass();
 
diff --git a/mlir/include/mlir/Dialect/Async/Passes.td b/mlir/include/mlir/Dialect/Async/Passes.td
--- a/mlir/include/mlir/Dialect/Async/Passes.td
+++ b/mlir/include/mlir/Dialect/Async/Passes.td
@@ -29,7 +29,12 @@
 
     Option<"minTaskSize", "min-task-size",
       "int32_t", /*default=*/"1000",
-      "The minimum task size for sharding parallel operation.">
+      "The minimum task size for sharding parallel operation.">,
+
+    Option<"useCostModel", "use-cost-model",
+      "bool", /*default=*/"false",
+      "Use experimental cost model to determine the parallelism granularity.">
+
   ];
 
   let dependentDialects = [
diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
--- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
+++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CostModel.h"
 #include "PassDetail.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Async/IR/Async.h"
@@ -93,9 +94,10 @@
   AsyncParallelForPass() = default;
 
   AsyncParallelForPass(bool asyncDispatch, int32_t numWorkerThreads,
-                       int32_t minTaskSize) {
+                       bool useCostModel, int32_t minTaskSize) {
     this->asyncDispatch = asyncDispatch;
     this->numWorkerThreads = numWorkerThreads;
+    this->useCostModel = useCostModel;
     this->minTaskSize = minTaskSize;
   }
 
@@ -115,6 +117,7 @@
 private:
   bool asyncDispatch;
   int32_t numWorkerThreads;
+  bool useCostModel;
   int32_t minTaskSize;
 };
 
@@ -672,6 +675,11 @@
   auto dispatch = [&](OpBuilder &nestedBuilder, Location loc) {
     ImplicitLocOpBuilder nb(loc, nestedBuilder);
 
+    // Create a parallel compute function that takes a block id and computes the
+    // parallel operation body for a subset of iteration space.
+    ParallelComputeFunction parallelComputeFunction =
+        createParallelComputeFunction(op, rewriter);
+
     // With large number of threads the value of creating many compute blocks
     // is reduced because the problem typically becomes memory bound. For small
     // number of threads it helps with stragglers.
@@ -693,16 +701,20 @@
     //   blockSize = min(tripCount,
     //                   max(ceil_div(tripCount, maxComputeBlocks),
     //                       ceil_div(minTaskSize, bodySize)))
+    Value bodySize;
+    if (useCostModel) {
+      CostModel helper(b, *op);
+      Cost cost = helper.estimateCost(op.getLoopBody());
+      bodySize = helper.costToNanoseconds(cost);
+    } else {
+      bodySize = b.create<arith::ConstantIndexOp>(32);
+    }
     Value bs0 = b.create<arith::CeilDivSIOp>(tripCount, maxComputeBlocks);
-    Value bs1 = b.create<arith::MaxSIOp>(bs0, minTaskSizeCst);
-    Value blockSize = b.create<arith::MinSIOp>(tripCount, bs1);
+    Value bs1 = b.create<arith::CeilDivSIOp>(minTaskSizeCst, bodySize);
+    Value bs2 = b.create<arith::MaxSIOp>(bs0, bs1);
+    Value blockSize = b.create<arith::MinSIOp>(tripCount, bs2);
     Value blockCount = b.create<arith::CeilDivSIOp>(tripCount, blockSize);
 
-    // Create a parallel compute function that takes a block id and computes the
-    // parallel operation body for a subset of iteration space.
-    ParallelComputeFunction parallelComputeFunction =
-        createParallelComputeFunction(op, rewriter);
-
     // Dispatch parallel compute function using async recursive work splitting,
     // or by submitting compute task sequentially from a caller thread.
     if (asyncDispatch) {
@@ -742,7 +754,8 @@
 
 std::unique_ptr<Pass> mlir::createAsyncParallelForPass(bool asyncDispatch,
                                                        int32_t numWorkerThreads,
+                                                       bool useCostModel,
                                                        int32_t minTaskSize) {
   return std::make_unique<AsyncParallelForPass>(asyncDispatch, numWorkerThreads,
-                                                minTaskSize);
+                                                useCostModel, minTaskSize);
 }
diff --git a/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt
--- a/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Async/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@
   AsyncRuntimeRefCounting.cpp
   AsyncRuntimeRefCountingOpt.cpp
   AsyncToAsyncRuntime.cpp
+  CostModel.cpp
   PassDetail.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -15,10 +16,13 @@
   MLIRArithmetic
   MLIRAsync
   MLIRIR
+  MLIRMath
+  MLIRMemref
   MLIRPass
   MLIRSCF
   MLIRSCFToStandard
   MLIRStandard
   MLIRTransforms
   MLIRTransformUtils
+  MLIRVector
 )
diff --git a/mlir/lib/Dialect/Async/Transforms/CostModel.h b/mlir/lib/Dialect/Async/Transforms/CostModel.h
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Async/Transforms/CostModel.h
@@ -0,0 +1,78 @@
+//===- CostModel.h - Declaration of the cost model.           --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===////
+//
+// This file declares a cost model that drives deciding the parallelization
+// granularity when lowering parallel operations into async tasks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_
+#define DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_
+
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+
+namespace mlir {
+
+// Cost measuring unit.
+//
+// For CPU roughly corresponds to cycles. For RAM roughly corresponds to bytes.
+using InverseThroughput = Value;
+
+// Approximate cost of executing an op on a modern CPU.
+//
+// Encapsulates the leaf nodes of the IR that is being emitted as the cost
+// estimation progresses. The leaf nodes correspond to CPU and RAM
+// inverse-throughputs.
+class Cost {
+private:
+  InverseThroughput ram;
+  InverseThroughput cpu;
+  ImplicitLocOpBuilder builder;
+
+public:
+  Cost(ImplicitLocOpBuilder &builder, size_t ramCost = 0, size_t cpuCost = 0);
+
+  Cost(ImplicitLocOpBuilder &builder, InverseThroughput &ramCost,
+       InverseThroughput &cpuCost);
+
+  InverseThroughput &getRAM();
+  InverseThroughput &getCPU();
+
+  Cost &operator*=(const size_t &rhs);
+  Cost operator*(const size_t &rhs);
+  Cost &operator+=(const Cost &rhs);
+  Cost operator+(const Cost &rhs);
+};
+
+// Estimates execution time for an op on a modern CPU.
+//
+// Errs on the lower end, but is not strictly a lower bound estimate. Targeting
+// being within an order of magnitude of the correct value.
+class CostModel {
+private:
+  ImplicitLocOpBuilder builder;
+  Operation &rootOp;
+
+public:
+  CostModel(ImplicitLocOpBuilder &builder, Operation &rootOp)
+      : builder(builder), rootOp(rootOp) {}
+
+  ImplicitLocOpBuilder &getBuilder();
+  bool isBeforeRoot(Operation &op);
+  bool isBeforeRoot(Value &value);
+  Optional<Value> getIterations(Value lowerBound, Value upperBound, Value step);
+  Cost newCost(size_t ramCost, size_t cpuCost);
+  Cost newCost(Value ramCost, Value cpuCost);
+  Cost zeroCost();
+  Cost estimateCost(Operation &op);
+  Cost estimateCost(Region &region);
+  Value costToNanoseconds(Cost cost);
+};
+
+} // namespace mlir
+#endif // DIALECT_ASYNC_TRANSFORMS_COSTMODEL_H_
diff --git a/mlir/lib/Dialect/Async/Transforms/CostModel.cpp b/mlir/lib/Dialect/Async/Transforms/CostModel.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/Async/Transforms/CostModel.cpp
@@ -0,0 +1,232 @@
+//===- CostModel.cpp - Implementation of the cost model.           --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a cost model that drives deciding the parallelization
+// granularity when lowering parallel operations into async tasks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CostModel.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
+
+namespace mlir {
+
+Cost::Cost(ImplicitLocOpBuilder &builder, size_t ramCost, size_t cpuCost)
+    : builder(builder), ram(builder.create<arith::ConstantIndexOp>(ramCost)),
+      cpu(builder.create<arith::ConstantIndexOp>(cpuCost)) {}
+
+Cost::Cost(ImplicitLocOpBuilder &builder, InverseThroughput &ramCost,
+           InverseThroughput &cpuCost)
+    : builder(builder), ram(ramCost), cpu(cpuCost) {}
+
+InverseThroughput &Cost::getRAM() { return ram; }
+InverseThroughput &Cost::getCPU() { return cpu; }
+
+Cost &Cost::operator*=(const size_t &rhs) {
+  auto factor = builder.create<arith::ConstantIndexOp>(rhs);
+  auto scale = [&](InverseThroughput resource) {
+    return builder.create<arith::MulIOp>(resource, factor);
+  };
+  ram = scale(ram);
+  cpu = scale(cpu);
+  return *this;
+}
+
+Cost Cost::operator*(const size_t &rhs) {
+  Cost ret = *this;
+  ret *= rhs;
+  return ret;
+}
+
+Cost &Cost::operator+=(const Cost &rhs) {
+  ram = builder.create<arith::AddIOp>(ram, rhs.ram);
+  cpu = builder.create<arith::AddIOp>(cpu, rhs.cpu);
+  return *this; // return the result by reference
+}
+
+Cost Cost::operator+(const Cost &rhs) {
+  Cost ret = *this;
+  ret += rhs;
+  return ret;
+}
+
+Cost estimateCostVector(CostModel &helper, Operation &op) {
+
+  auto newCost = [&](Value value) {
+    if (auto type = value.getType().dyn_cast<ShapedType>()) {
+      if (type.hasStaticShape()) {
+        return helper.newCost(/* ram */ type.getNumElements() *
+                                  type.getElementTypeBitWidth() / 8,
+                              /* cpu */ 0);
+      }
+    }
+    return helper.zeroCost();
+  };
+
+  if (auto loadOp = dyn_cast<vector::LoadOp>(op)) {
+    return newCost(loadOp.getResult());
+  }
+
+  if (auto storeOp = dyn_cast<vector::StoreOp>(op)) {
+    return newCost(storeOp.getOperand(0));
+  }
+
+  return helper.zeroCost();
+}
+
+Cost estimateCostMemref(CostModel &helper, Operation &op) {
+
+  auto newCost = [&](Value value) {
+    return helper.newCost(/* ram */ value.getType().getIntOrFloatBitWidth() / 8,
+                          /* cpu */ 0);
+  };
+
+  if (auto loadOp = dyn_cast<memref::LoadOp>(op)) {
+    return newCost(loadOp.getResult());
+  }
+
+  if (auto storeOp = dyn_cast<memref::StoreOp>(op)) {
+    return newCost(storeOp.getOperand(0));
+  }
+
+  return helper.zeroCost();
+}
+
+Cost estimateCostArith(CostModel &helper, Operation &op) {
+
+  if (isa<arith::DivUIOp, arith::DivSIOp, arith::CeilDivSIOp,
+          arith::FloorDivSIOp, arith::RemSIOp, arith::RemUIOp>(op)) {
+    return helper.newCost(/* ram */ 0, /* cpu */ 3);
+  }
+
+  return helper.newCost(/* ram */ 0, /* cpu */ 1);
+}
+
+Cost estimateCostMath(CostModel &helper, Operation &op) {
+
+  if (isa<math::AtanOp, math::Atan2Op, math::CosOp, math::SinOp, math::ExpOp,
+          math::Exp2Op, math::ExpM1Op, math::LogOp, math::Log10Op,
+          math::Log1pOp, math::Log2Op, math::PowFOp, math::RsqrtOp,
+          math::SqrtOp, math::TanhOp>(op)) {
+    return helper.newCost(/* ram */ 0, /* cpu */ 100);
+  }
+
+  return helper.newCost(/* ram */ 0, /* cpu */ 1);
+}
+
+Cost estimateCostSCF(CostModel &helper, Operation &op) {
+
+  auto scaleCost = [&](Value iterations, Cost cost) {
+    return helper.newCost(
+        helper.getBuilder().create<arith::MulIOp>(cost.getRAM(), iterations),
+        helper.getBuilder().create<arith::MulIOp>(cost.getCPU(), iterations));
+  };
+
+  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+    Cost cost = helper.estimateCost(forOp.getLoopBody());
+    if (auto iterations = helper.getIterations(
+            forOp.lowerBound(), forOp.upperBound(), forOp.step())) {
+      return scaleCost(*iterations, cost);
+    }
+    return cost;
+  }
+
+  if (auto parallelOp = dyn_cast<scf::ParallelOp>(op)) {
+    Cost cost = helper.estimateCost(parallelOp.getLoopBody());
+    for (auto &inductionVariableDomain : llvm::enumerate(
+             llvm::zip(parallelOp.lowerBound(), parallelOp.upperBound(),
+                       parallelOp.step()))) {
+      Value lb, ub, step;
+      std::tie(lb, ub, step) = inductionVariableDomain.value();
+
+      if (auto iterations = helper.getIterations(lb, ub, step)) {
+        cost = scaleCost(*iterations, cost);
+      }
+    }
+    return cost;
+  }
+
+  return helper.zeroCost();
+}
+
+ImplicitLocOpBuilder &CostModel::getBuilder() { return builder; }
+
+bool CostModel::isBeforeRoot(Operation &op) {
+  return op.getBlock() == rootOp.getBlock() && op.isBeforeInBlock(&rootOp);
+}
+
+bool CostModel::isBeforeRoot(Value &value) {
+  return isBeforeRoot(*value.getDefiningOp());
+}
+
+Optional<Value> CostModel::getIterations(Value lowerBound, Value upperBound,
+                                         Value step) {
+  if (isBeforeRoot(lowerBound) && isBeforeRoot(upperBound) &&
+      isBeforeRoot(step)) {
+    return (Value)builder.create<arith::CeilDivSIOp>(
+        builder.create<arith::SubIOp>(upperBound, lowerBound), step);
+  }
+  return llvm::None;
+}
+
+Cost CostModel::newCost(size_t ramCost, size_t cpuCost) {
+  return Cost(builder, ramCost, cpuCost);
+}
+
+Cost CostModel::newCost(Value ramCost, Value cpuCost) {
+  return Cost(builder, ramCost, cpuCost);
+}
+
+Cost CostModel::zeroCost() { return Cost(builder, 0, 0); }
+
+Cost CostModel::estimateCost(Operation &op) {
+  MLIRContext *ctx = op.getContext();
+  if (op.getDialect() == ctx->getLoadedDialect("scf")) {
+    return estimateCostSCF(*this, op);
+  }
+  if (op.getDialect() == ctx->getLoadedDialect("memref")) {
+    return estimateCostMemref(*this, op);
+  }
+  if (op.getDialect() == ctx->getLoadedDialect("vector")) {
+    return estimateCostVector(*this, op);
+  }
+  if (op.getDialect() == ctx->getLoadedDialect("arith")) {
+    return estimateCostArith(*this, op);
+  }
+  if (op.getDialect() == ctx->getLoadedDialect("math")) {
+    return estimateCostMath(*this, op);
+  }
+  return zeroCost();
+}
+
+Cost CostModel::estimateCost(Region &region) {
+  Cost cost = zeroCost();
+  for (auto &op : region.getOps()) {
+    cost += estimateCost(op);
+  }
+  return cost;
+}
+
+Value CostModel::costToNanoseconds(Cost cost) {
+  // Assume that RAM throughput is 16 GB/s and that CPU runs at 3 GHz.
+  auto ramRuntime = builder.create<arith::DivUIOp>(
+      cost.getRAM(), builder.create<arith::ConstantIndexOp>(16));
+  auto cpuRuntime = builder.create<arith::DivUIOp>(
+      cost.getCPU(), builder.create<arith::ConstantIndexOp>(3));
+  auto max = [&](Value x, Value y) {
+    return builder.create<arith::MaxSIOp>(x, y);
+  };
+  return max(max(ramRuntime, cpuRuntime),
+             builder.create<arith::ConstantIndexOp>(1));
+}
+
+} // namespace mlir
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2014,6 +2014,8 @@
         ":Async",
         ":AsyncPassIncGen",
         ":IR",
+        ":MathDialect",
+        ":MemRefDialect",
         ":Pass",
         ":SCFDialect",
         ":SCFToStandard",
@@ -2022,6 +2024,7 @@
         ":TransformUtils",
         ":Transforms",
         ":TransformsPassIncGen",
+        ":VectorOps",
         "//llvm:Core",
         "//llvm:Support",
     ],