diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -0,0 +1,30 @@
+//===- ParallelLoopMapper.h - Parallel loop to GPU mapping -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utilities to generate mappings for parallel
+// loops to GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+
+namespace mlir {
+
+class FuncOp;
+
+/// Maps the parallel loops found in the given function to workgroups. The first
+/// loop encountered will be mapped to the global workgroup and the second loop
+/// encountered to the local workgroup. Within each mapping, the first three
+/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
+/// mapped to sequential loops.
+void greedilyMapParallelLoopsToGPU(FuncOp op);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
--- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
+++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
@@ -190,6 +190,9 @@
   let extraClassDeclaration = [{
     Block *getBody() { return &region().front(); }
+    unsigned getNumInductionVars() {
+      return getBody()->getNumArguments();
+    }
     iterator_range<Block::args_iterator> getInductionVars() {
       return {getBody()->args_begin(), getBody()->args_end()};
     }
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -3,6 +3,7 @@
   Transforms/AllReduceLowering.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
+  Transforms/ParallelLoopMapper.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -0,0 +1,88 @@
+//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities to generate mappings for parallel loops to
+// GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+using namespace mlir::loop;
+
+namespace {
+
+enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+
+static constexpr int kNumHardwareIds = 3;
+
+/// Bounded increment on MappingLevel. Increments to the next
+/// level unless Sequential was already reached.
+MappingLevel &operator++(MappingLevel &mappingLevel) {
+  if (mappingLevel < Sequential) {
+    mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
+  }
+  return mappingLevel;
+}
+
+/// Computes the hardware id to use for a given mapping level. Will
+/// assign x,y and z hardware ids for the first 3 dimensions and use
+/// sequential after.
+int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
+  if (dimension >= kNumHardwareIds || level == Sequential)
+    return Sequential * kNumHardwareIds;
+  return (level * kNumHardwareIds) + dimension;
+}
+
+/// Add mapping information to the given parallel loop. Do not add
+/// mapping information if the loop already has it. Also, don't
+/// start a mapping at a nested loop.
+void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) {
+  // Do not try to add a mapping to already mapped loops or nested loops.
+  if (parallelOp.getAttr("mapping") ||
+      ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
+    return;
+
+  MLIRContext *ctx = parallelOp.getContext();
+  Builder b(ctx);
+  SmallVector<Attribute, 4> attrs;
+  attrs.reserve(parallelOp.getNumInductionVars());
+  for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
+    SmallVector<NamedAttribute, 3> entries;
+    entries.emplace_back(
+        Identifier::get("processor", ctx),
+        b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)));
+    entries.emplace_back(Identifier::get("map", ctx),
+                         AffineMapAttr::get(b.getDimIdentityMap()));
+    entries.emplace_back(Identifier::get("bound", ctx),
+                         AffineMapAttr::get(b.getDimIdentityMap()));
+    attrs.push_back(DictionaryAttr::get(entries, ctx));
+  }
+  parallelOp.setAttr("mapping", ArrayAttr::get(attrs, ctx));
+  ++mappingLevel;
+  // Parallel loop operations are immediately nested, so do not use
+  // walk but just iterate over the operations.
+  for (Operation &op : *parallelOp.getBody()) {
+    if (ParallelOp nested = dyn_cast<ParallelOp>(op))
+      mapParallelOp(nested, mappingLevel);
+  }
+}
+
+} // namespace
+
+void mlir::greedilyMapParallelLoopsToGPU(FuncOp func) {
+  func.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+}
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                step (%four, %four) {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                  step (%one, %one) {
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
+// CHECK-NOT: mapping
+
+// -----
+
+func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
+                       %arg3 : index) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3)
+                step (%four, %four, %four, %four) {
+    loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+                  step (%one, %one, %one, %one) {
+      loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+                    step (%one, %one, %one, %one) {
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @parallel_loop_4d(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK-NOT: mapping
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@
   TestConstantFold.cpp
   TestLoopFusion.cpp
   TestGpuMemoryPromotion.cpp
+  TestGpuParallelLoopMapping.cpp
   TestInlining.cpp
   TestLinalgTransforms.cpp
   TestLiveness.cpp
diff --git a/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
@@ -0,0 +1,36 @@
+//===- TestGpuParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for mapping parallel
+// loops to gpu hardware ids.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the mapping of parallel loops to hardware ids using
+/// a greedy mapping strategy.
+class TestGpuGreedyParallelLoopMappingPass
+    : public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
+  void runOnOperation() override {
+    greedilyMapParallelLoopsToGPU(getOperation());
+  }
+};
+} // end namespace
+
+namespace mlir {
+void registerTestGpuParallelLoopMappingPass() {
+  PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
+      "test-gpu-greedy-parallel-loop-mapping",
+      "Greedily maps all parallel loops to gpu hardware ids.");
+}
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -49,6 +49,7 @@
+void registerTestGpuParallelLoopMappingPass();
 void registerTestMemRefStrideCalculation();
 void registerTestOpaqueLoc();
 void registerTestParallelismDetection();
 void registerTestVectorConversions();
 void registerTestVectorToLoopsPass();
 void registerVectorizerTestPass();
@@ -101,6 +102,7 @@
+  registerTestGpuParallelLoopMappingPass();
   registerTestMemRefStrideCalculation();
   registerTestOpaqueLoc();
   registerTestParallelismDetection();
   registerTestVectorConversions();
   registerTestVectorToLoopsPass();
   registerVectorizerTestPass();