Index: mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
===================================================================
--- mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
+++ mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
@@ -30,38 +30,47 @@
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
 ///                                             step (%arg4*tileSize[0],
 ///                                                   %arg5*tileSize[1])
-///     scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%j0)
-///                                           min(tileSize[1], %arg3-%j1))
+///     scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
+///                                           min(%arg5*tileSize[1], %arg3-%i1))
 ///                                        step (%arg4, %arg5)
 /// The old loop is replaced with the new one.
 void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
   OpBuilder b(op);
   auto zero = b.create<ConstantIndexOp>(op.getLoc(), 0);
-  SmallVector<Value, 2> tileSizeConstants;
-  tileSizeConstants.reserve(op.upperBound().size());
+
+  // Store the compile-time loop information if known.
+  SmallVector<ConstantIndexOp, 2> tileSizeConstants(op.upperBound().size());
+  SmallVector<Optional<int64_t>, 2> compileTimeLengths(op.upperBound().size());
+  SmallVector<Optional<int64_t>, 2> compileTimeSteps(op.upperBound().size());
   for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {
-    if (i < tileSizes.size())
-      tileSizeConstants.push_back(
-          b.create<ConstantIndexOp>(op.getLoc(), tileSizes[i]));
-    else
-      // Just pick 1 for the remaining dimensions.
-      tileSizeConstants.push_back(b.create<ConstantIndexOp>(op.getLoc(), 1));
+    // Create the tile size constants and set them to one by default.
+    tileSizeConstants[i] = b.create<ConstantIndexOp>(
+        op.getLoc(), i < tileSizes.size() ? tileSizes[i] : 1);
+    // Compute the loop length and step if known at compile-time.
+    auto lowerBound =
+        dyn_cast_or_null<ConstantIndexOp>(op.lowerBound()[i].getDefiningOp());
+    auto upperBound =
+        dyn_cast_or_null<ConstantIndexOp>(op.upperBound()[i].getDefiningOp());
+    auto step = dyn_cast_or_null<ConstantIndexOp>(op.step()[i].getDefiningOp());
+    if (lowerBound && upperBound && step) {
+      compileTimeLengths[i] = upperBound.getValue() - lowerBound.getValue();
+      compileTimeSteps[i] = step.getValue() * tileSizeConstants[i].getValue();
+    }
   }
 
   // Create the outer loop with adjusted steps.
   SmallVector<Value, 2> newSteps;
   newSteps.reserve(op.step().size());
-  for (auto step : llvm::zip(op.step(), tileSizeConstants)) {
+  for (size_t i = 0, end = op.step().size(); i != end; ++i)
     newSteps.push_back(
-        b.create<MulIOp>(op.getLoc(), std::get<0>(step), std::get<1>(step)));
-  }
+        b.create<MulIOp>(op.getLoc(), op.step()[i], tileSizeConstants[i]));
   auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.lowerBound(),
                                         op.upperBound(), newSteps);
   b.setInsertionPointToStart(outerLoop.getBody());
 
   // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
-  // FIXME: Instead of using min, we want to replicate the tail. This would give
-  // the inner loop constant bounds for easy vectorization.
+  // FIXME: Instead of using min, we want to replicate the tail. This would
+  // give the inner loop constant bounds for easy vectorization.
   auto minMap = AffineMap::get(
       /*dimCount=*/3, /*symbolCount=*/0,
       {getAffineDimExpr(/*position=*/0, b.getContext()),
@@ -72,24 +81,56 @@
   // Create the inner loop with adjusted bounds.
   SmallVector<Value, 2> newBounds;
   newBounds.reserve(op.upperBound().size());
-  for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(),
-                               outerLoop.getInductionVars())) {
-    newBounds.push_back(b.create<AffineMinOp>(
-        op.getLoc(), b.getIndexType(), minMap,
-        ValueRange{std::get<0>(bounds), std::get<1>(bounds),
-                   std::get<2>(bounds)}));
+  for (size_t i = 0, end = op.upperBound().size(); i != end; ++i) {
+    // Set the bounds of the inner loop to the tile size if it is known at
+    // compile-time that the length of the outer loop is an integer multiple of
+    // the its step. Otherwise, adjust the bounds at runtime.
+    if (compileTimeLengths[i].hasValue() &&
+        compileTimeLengths[i].getValue() % compileTimeSteps[i].getValue() == 0)
+      newBounds.push_back(newSteps[i]);
+    else
+      newBounds.push_back(b.create<AffineMinOp>(
+          op.getLoc(), b.getIndexType(), minMap,
+          ValueRange{newSteps[i], outerLoop.upperBound()[i],
+                     outerLoop.getInductionVars()[i]}));
   }
   auto innerLoop = b.create<ParallelOp>(
       op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
       op.step());
 
-  // Steal the body of the old parallel loop and erase it.
-  innerLoop.region().takeBody(op.region());
+  // Move the body of the old parallel loop.
+  innerLoop.getBody()->getOperations().splice(
+      innerLoop.getBody()->begin(), op.getBody()->getOperations(),
+      op.getBody()->begin(), std::prev(op.getBody()->end()));
+
+  // Replace all uses of the old induction variables.
+  b.setInsertionPointToStart(innerLoop.getBody());
+  for (size_t i = 0, end = innerLoop.getInductionVars().size(); i != end; ++i) {
+    // Get the induction variables of the inner and outer loop.
+    Value outerIV = outerLoop.getInductionVars()[i];
+    Value innerIV = innerLoop.getInductionVars()[i];
+    // Replace the induction variables by the lower bounds if the loop
+    // performs a single iteration.
+    if (compileTimeLengths[i].hasValue() &&
+        compileTimeLengths[i].getValue() <= compileTimeSteps[i].getValue())
+      outerIV = outerLoop.lowerBound()[i];
+    if (tileSizeConstants[i].getValue() == 1)
+      innerIV = innerLoop.lowerBound()[i];
+    // Replace all uses of the old induction variables by the sum of the
+    // induction variables of the inner and the outer loop.
+    Value result = b.create<AffineApplyOp>(
+        op.getLoc(),
+        AffineMap::get(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),
+        ValueRange{outerIV, innerIV});
+    op.getBody()->getArgument(i).replaceAllUsesWith(result);
+  }
+
+  // Erase the old parallel loop.
   op.erase();
 }
 
-/// Get a list of most nested parallel loops. Assumes that ParallelOps are only
-/// directly nested.
+/// Get a list of most nested parallel loops. Assumes that ParallelOps are
+/// only directly nested.
 static bool getInnermostNestedLoops(Block *block,
                                     SmallVectorImpl<ParallelOp> &loops) {
   bool hasInnerLoop = false;
Index: mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
===================================================================
--- mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
+++ mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s -pass-pipeline='func(parallel-loop-tiling{parallel-loop-tile-sizes=1,4})' -split-input-file | FileCheck %s
 
-func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+func @dynamic_loop(%arg0 : index, %arg1 : index, %arg2 : index,
                     %arg3 : index, %arg4 : index, %arg5 : index,
 		    %A: memref<?x?xf32>, %B: memref<?x?xf32>,
                     %C: memref<?x?xf32>, %result: memref<?x?xf32>) {
@@ -14,7 +14,8 @@
 }
 
 // CHECK:       #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
-// CHECK-LABEL:   func @parallel_loop(
+// CHECK:       #map1 = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-LABEL:   func @dynamic_loop(
 // CHECK-SAME:                        [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: index, [[VAL_6:%.*]]: memref<?x?xf32>, [[VAL_7:%.*]]: memref<?x?xf32>, [[VAL_8:%.*]]: memref<?x?xf32>, [[VAL_9:%.*]]: memref<?x?xf32>) {
 // CHECK:           [[VAL_10:%.*]] = constant 0 : index
 // CHECK:           [[VAL_11:%.*]] = constant 1 : index
@@ -22,13 +23,54 @@
 // CHECK:           [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index
 // CHECK:           [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index
 // CHECK:           scf.parallel ([[VAL_15:%.*]], [[VAL_16:%.*]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) {
-// CHECK:             [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]])
-// CHECK:             [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]])
+// CHECK:             [[VAL_17:%.*]] = affine.min #map0([[VAL_13]], [[VAL_2]], [[VAL_15]])
+// CHECK:             [[VAL_18:%.*]] = affine.min #map0([[VAL_14]], [[VAL_3]], [[VAL_16]])
 // CHECK:             scf.parallel ([[VAL_19:%.*]], [[VAL_20:%.*]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) {
-// CHECK:               [[VAL_21:%.*]] = load [[VAL_7]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32>
-// CHECK:               [[VAL_22:%.*]] = load [[VAL_8]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32>
-// CHECK:               [[VAL_23:%.*]] = addf [[VAL_21]], [[VAL_22]] : f32
-// CHECK:               store [[VAL_23]], [[VAL_9]]{{\[}}[[VAL_19]], [[VAL_20]]] : memref<?x?xf32>
+// CHECK:               [[VAL_21:%.*]] = affine.apply #map1([[VAL_15]], [[VAL_10]])
+// CHECK:               [[VAL_22:%.*]] = affine.apply #map1([[VAL_16]], [[VAL_20]])
+// CHECK:               [[VAL_23:%.*]] = load [[VAL_7]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
+// CHECK:               [[VAL_24:%.*]] = load [[VAL_8]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
+// CHECK:               [[VAL_25:%.*]] = addf [[VAL_23]], [[VAL_24]] : f32
+// CHECK:               store [[VAL_25]], [[VAL_9]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           return
+
+// -----
+
+func @static_loop(%A: memref<24x24xf32>, %B: memref<24x24xf32>,
+                    %C: memref<24x24xf32>, %result: memref<24x24xf32>) {
+  %c0 = constant 0 : index
+  %c3 = constant 3 : index
+  %c24 = constant 24 : index
+  scf.parallel (%i0, %i1) = (%c0, %c0) to (%c24, %c24) step (%c3, %c3) {
+    %B_elem = load %B[%i0, %i1] : memref<24x24xf32>
+    %C_elem = load %C[%i0, %i1] : memref<24x24xf32>
+    %sum_elem = addf %B_elem, %C_elem : f32
+    store %sum_elem, %result[%i0, %i1] : memref<24x24xf32>
+  }
+  return
+}
+
+// CHECK:       #map0 = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-LABEL:   func @static_loop(
+// CHECK-SAME:                       [[VAL_0:%.*]]: memref<24x24xf32>, [[VAL_1:%.*]]: memref<24x24xf32>, [[VAL_2:%.*]]: memref<24x24xf32>, [[VAL_3:%.*]]: memref<24x24xf32>) {
+// CHECK:           [[VAL_4:%.*]] = constant 0 : index
+// CHECK:           [[VAL_5:%.*]] = constant 3 : index
+// CHECK:           [[VAL_6:%.*]] = constant 24 : index
+// CHECK:           [[VAL_7:%.*]] = constant 0 : index
+// CHECK:           [[VAL_8:%.*]] = constant 1 : index
+// CHECK:           [[VAL_9:%.*]] = constant 4 : index
+// CHECK:           [[VAL_10:%.*]] = muli [[VAL_5]], [[VAL_8]] : index
+// CHECK:           [[VAL_11:%.*]] = muli [[VAL_5]], [[VAL_9]] : index
+// CHECK:           scf.parallel ([[VAL_12:%.*]], [[VAL_13:%.*]]) = ([[VAL_4]], [[VAL_4]]) to ([[VAL_6]], [[VAL_6]]) step ([[VAL_10]], [[VAL_11]]) {
+// CHECK:             scf.parallel ([[VAL_14:%.*]], [[VAL_15:%.*]]) = ([[VAL_7]], [[VAL_7]]) to ([[VAL_10]], [[VAL_11]]) step ([[VAL_5]], [[VAL_5]]) {
+// CHECK:               [[VAL_16:%.*]] = affine.apply #map0([[VAL_12]], [[VAL_7]])
+// CHECK:               [[VAL_17:%.*]] = affine.apply #map0([[VAL_13]], [[VAL_15]])
+// CHECK:               [[VAL_18:%.*]] = load [[VAL_1]]{{\[}}[[VAL_16]], [[VAL_17]]] : memref<24x24xf32>
+// CHECK:               [[VAL_19:%.*]] = load [[VAL_2]]{{\[}}[[VAL_16]], [[VAL_17]]] : memref<24x24xf32>
+// CHECK:               [[VAL_20:%.*]] = addf [[VAL_18]], [[VAL_19]] : f32
+// CHECK:               store [[VAL_20]], [[VAL_3]]{{\[}}[[VAL_16]], [[VAL_17]]] : memref<24x24xf32>
 // CHECK:             }
 // CHECK:           }
 // CHECK:           return
@@ -49,31 +91,33 @@
 }
 
 // CHECK-LABEL:   func @tile_nested_innermost() {
-// CHECK:           [[VAL_24:%.*]] = constant 2 : index
-// CHECK:           [[VAL_25:%.*]] = constant 0 : index
-// CHECK:           [[VAL_26:%.*]] = constant 1 : index
-// CHECK:           scf.parallel ([[VAL_27:%.*]], [[VAL_28:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) {
-// CHECK:             [[VAL_29:%.*]] = constant 0 : index
-// CHECK:             [[VAL_30:%.*]] = constant 1 : index
-// CHECK:             [[VAL_31:%.*]] = constant 4 : index
-// CHECK:             [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index
-// CHECK:             [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index
-// CHECK:             scf.parallel ([[VAL_34:%.*]], [[VAL_35:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) {
-// CHECK:               [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]])
-// CHECK:               [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]])
-// CHECK:               scf.parallel ([[VAL_38:%.*]], [[VAL_39:%.*]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) {
+// CHECK:           [[VAL_0:%.*]] = constant 2 : index
+// CHECK:           [[VAL_1:%.*]] = constant 0 : index
+// CHECK:           [[VAL_2:%.*]] = constant 1 : index
+// CHECK:           scf.parallel ([[VAL_3:%.*]], [[VAL_4:%.*]]) = ([[VAL_1]], [[VAL_1]]) to ([[VAL_0]], [[VAL_0]]) step ([[VAL_2]], [[VAL_2]]) {
+// CHECK:             [[VAL_5:%.*]] = constant 0 : index
+// CHECK:             [[VAL_6:%.*]] = constant 1 : index
+// CHECK:             [[VAL_7:%.*]] = constant 4 : index
+// CHECK:             [[VAL_8:%.*]] = muli [[VAL_2]], [[VAL_6]] : index
+// CHECK:             [[VAL_9:%.*]] = muli [[VAL_2]], [[VAL_7]] : index
+// CHECK:             scf.parallel ([[VAL_10:%.*]], [[VAL_11:%.*]]) = ([[VAL_1]], [[VAL_1]]) to ([[VAL_0]], [[VAL_0]]) step ([[VAL_8]], [[VAL_9]]) {
+// CHECK:               [[VAL_12:%.*]] = affine.min #map0([[VAL_9]], [[VAL_0]], [[VAL_11]])
+// CHECK:               scf.parallel ([[VAL_13:%.*]], [[VAL_14:%.*]]) = ([[VAL_5]], [[VAL_5]]) to ([[VAL_8]], [[VAL_12]]) step ([[VAL_2]], [[VAL_2]]) {
+// CHECK:                 = affine.apply #map1([[VAL_10]], [[VAL_5]])
+// CHECK:                 = affine.apply #map1([[VAL_1]], [[VAL_14]])
 // CHECK:               }
 // CHECK:             }
 // CHECK:           }
-// CHECK:           [[VAL_40:%.*]] = constant 0 : index
-// CHECK:           [[VAL_41:%.*]] = constant 1 : index
-// CHECK:           [[VAL_42:%.*]] = constant 4 : index
-// CHECK:           [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index
-// CHECK:           [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index
-// CHECK:           scf.parallel ([[VAL_45:%.*]], [[VAL_46:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) {
-// CHECK:             [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]])
-// CHECK:             [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]])
-// CHECK:             scf.parallel ([[VAL_49:%.*]], [[VAL_50:%.*]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) {
+// CHECK:           [[VAL_15:%.*]] = constant 0 : index
+// CHECK:           [[VAL_16:%.*]] = constant 1 : index
+// CHECK:           [[VAL_17:%.*]] = constant 4 : index
+// CHECK:           [[VAL_18:%.*]] = muli [[VAL_2]], [[VAL_16]] : index
+// CHECK:           [[VAL_19:%.*]] = muli [[VAL_2]], [[VAL_17]] : index
+// CHECK:           scf.parallel ([[VAL_20:%.*]], [[VAL_21:%.*]]) = ([[VAL_1]], [[VAL_1]]) to ([[VAL_0]], [[VAL_0]]) step ([[VAL_18]], [[VAL_19]]) {
+// CHECK:             [[VAL_22:%.*]] = affine.min #map0([[VAL_19]], [[VAL_0]], [[VAL_21]])
+// CHECK:             scf.parallel ([[VAL_23:%.*]], [[VAL_24:%.*]]) = ([[VAL_15]], [[VAL_15]]) to ([[VAL_18]], [[VAL_22]]) step ([[VAL_2]], [[VAL_2]]) {
+// CHECK:               = affine.apply #map1([[VAL_20]], [[VAL_15]])
+// CHECK:               = affine.apply #map1([[VAL_1]], [[VAL_24]])
 // CHECK:             }
 // CHECK:           }
 // CHECK:           return