diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -0,0 +1,50 @@ +//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file declares the utilities to generate mappings for parallel +// loops to GPU devices. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H +#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H + +namespace mlir { + +struct Region; + +namespace gpu { + +/// Name of the mapping attribute produced by loop mappers. +static constexpr const char *kMappingAttributeName = "mapping"; +/// Name of the processor sub-attribute that identifies the hardware id +/// to map a loop to. +static constexpr const char *kProcessorEntryName = "processor"; +/// Name of the map sub-attribute that identifies the affine map to apply +/// to the hardware id to compute the iteration number of the loop. This +/// map is expected to be extended by step and lower bound computations: +/// index = map(hardware_id) * step + lowerbound +static constexpr const char *kIndexMapEntryName = "map"; +/// Name of the bound sub-attribute that identifies the affine map to +/// compute an upper bound of iterations for the hardware id. This is +/// applied to an upper bound on the number of iterations: +/// launchBound = bound(upperbound-lowerbound ceildiv step) +static constexpr const char *kBoundMapEntryName = "bound"; + +} // end namespace gpu + +/// Maps the parallel loops found in the given function to workgroups. 
The first +/// loop encountered will be mapped to the global workgroup and the second loop +/// encountered to the local workgroup. Within each mapping, the first three +/// dimensions are mapped to x/y/z hardware ids and all following dimensions are +/// mapped to sequential loops. +void greedilyMapParallelLoopsToGPU(Region &region); + +} // end namespace mlir + +#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td --- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td @@ -289,6 +289,9 @@ let extraClassDeclaration = [{ Block *getBody() { return &region().front(); } + unsigned getNumInductionVars() { + return getBody()->getNumArguments(); + } iterator_range<Block::args_iterator> getInductionVars() { return {getBody()->args_begin(), getBody()->args_end()}; } diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -17,6 +17,7 @@ #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Dialect/AffineOps/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExpr.h" @@ -508,23 +509,20 @@ } // namespace -static constexpr const char *kProcessorEntryName = "processor"; -static constexpr const char *kIndexMapEntryName = "map"; -static constexpr const char *kBoundMapEntryName = "bound"; - /// Extracts the mapping annotations from the provided attribute. The attribute /// is expected to be of the form /// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> } /// where the bound is optional. 
static MappingAnnotation extractMappingAnnotation(Attribute attribute) { DictionaryAttr dict = attribute.cast<DictionaryAttr>(); - unsigned processor = dict.get(kProcessorEntryName) + unsigned processor = dict.get(gpu::kProcessorEntryName) .cast<IntegerAttr>() .getValue() .getSExtValue(); - AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue(); + AffineMap map = + dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue(); AffineMapAttr boundAttr = - dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>(); + dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>(); AffineMap bound; if (boundAttr) bound = boundAttr.getValue(); @@ -583,7 +581,8 @@ PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential - ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping"); + ArrayAttr mapping = + parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName); // TODO(herhut): Support reductions. if (!mapping || parallelOp.getNumResults() != 0) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -3,6 +3,7 @@ Transforms/AllReduceLowering.cpp Transforms/KernelOutlining.cpp Transforms/MemoryPromotion.cpp + Transforms/ParallelLoopMapper.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -0,0 +1,89 @@ +//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities to generate mappings for parallel loops to +// GPU devices. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" + +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/Passes.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::loop; + +namespace { + +enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; + +static constexpr int kNumHardwareIds = 3; + +} // namespace + +/// Bounded increment on MappingLevel. Increments to the next +/// level unless Sequential was already reached. +MappingLevel &operator++(MappingLevel &mappingLevel) { + if (mappingLevel < Sequential) { + mappingLevel = static_cast<MappingLevel>(mappingLevel + 1); + } + return mappingLevel; +} + +/// Computes the hardware id to use for a given mapping level. Will +/// assign x,y and z hardware ids for the first 3 dimensions and use +/// sequential after. +static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { + if (dimension >= kNumHardwareIds || level == Sequential) + return Sequential * kNumHardwareIds; + return (level * kNumHardwareIds) + dimension; +} + +/// Add mapping information to the given parallel loop. Do not add +/// mapping information if the loop already has it. Also, don't +/// start a mapping at a nested loop. +static void mapParallelOp(ParallelOp parallelOp, + MappingLevel mappingLevel = MapGrid) { + // Do not try to add a mapping to already mapped loops or nested loops. 
+ if (parallelOp.getAttr(gpu::kMappingAttributeName) || + ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>())) + return; + + MLIRContext *ctx = parallelOp.getContext(); + Builder b(ctx); + SmallVector<Attribute, 4> attrs; + attrs.reserve(parallelOp.getNumInductionVars()); + for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) { + SmallVector<NamedAttribute, 3> entries; + entries.emplace_back(b.getNamedAttr( + kProcessorEntryName, + b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)))); + entries.emplace_back(b.getNamedAttr( + kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); + entries.emplace_back(b.getNamedAttr( + kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); + attrs.push_back(DictionaryAttr::get(entries, ctx)); + } + parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx)); + ++mappingLevel; + // Parallel loop operations are immediately nested, so do not use + // walk but just iterate over the operations. + for (Operation &op : *parallelOp.getBody()) { + if (ParallelOp nested = dyn_cast<ParallelOp>(op)) + mapParallelOp(nested, mappingLevel); + } +} + +void mlir::greedilyMapParallelLoopsToGPU(Region &region) { + region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); }); +} diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/mapping.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) + step (%one, %one) { + } + } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: {mapping = [{bound = 
affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]} +// CHECK-NOT: mapping + +// ----- + +func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3) + step (%four, %four, %four, %four) { + loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four) + step (%one, %one, %one, %one) { + loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four) + step (%one, %one, %one, %one) { + } + } + } + return +} + +// CHECK-LABEL: func @parallel_loop_4d( +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> 
(d0)>, processor = 6 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]} +// CHECK-NOT: mapping diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ TestConstantFold.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp + TestGpuParallelLoopMapping.cpp TestInlining.cpp TestLinalgTransforms.cpp TestLiveness.cpp diff --git a/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp @@ -0,0 +1,38 @@ +//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass testing the utilities for mapping parallel +// loops to gpu hardware ids. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +/// Simple pass for testing the mapping of parallel loops to hardware ids using +/// a greedy mapping strategy. 
+class TestGpuGreedyParallelLoopMappingPass + : public OperationPass<TestGpuGreedyParallelLoopMappingPass> { + void runOnOperation() override { + Operation *op = getOperation(); + for (Region &region : op->getRegions()) + greedilyMapParallelLoopsToGPU(region); + } +}; +} // end namespace + +namespace mlir { +void registerTestGpuParallelLoopMappingPass() { + PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration( + "test-gpu-greedy-parallel-loop-mapping", + "Greedily maps all parallel loops to gpu hardware ids."); +} +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -50,6 +50,7 @@ void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); void registerTestParallelismDetection(); +void registerTestGpuParallelLoopMappingPass(); void registerTestVectorConversions(); void registerTestVectorToLoopsPass(); void registerVectorizerTestPass(); @@ -103,6 +104,7 @@ registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); registerTestParallelismDetection(); + registerTestGpuParallelLoopMappingPass(); registerTestVectorConversions(); registerTestVectorToLoopsPass(); registerVectorizerTestPass();