Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -3142,6 +3142,21 @@
 
    !0 = metadata !{ metadata !"llvm.loop.unroll.full" }
 
+'``llvm.loop.minimum_dependence_distance``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata specifies a minimum dependence distance for accesses marked
+with ``llvm.mem.parallel_loop_access``. The first operand is the string
+``llvm.loop.minimum_dependence_distance`` and the second operand is a
+positive integer specifying the minimum distance.
+For example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{metadata !"llvm.loop.minimum_dependence_distance", i32 4}
+
+specifies that parallel accesses have a minimum dependence distance of 4.
+
 '``llvm.mem``'
 ^^^^^^^^^^^^^^^
 
@@ -3154,19 +3169,25 @@
 The ``llvm.mem.parallel_loop_access`` metadata refers to a loop identifier, 
 or metadata containing a list of loop identifiers for nested loops. 
 The metadata is attached to memory accessing instructions and denotes that 
-no loop carried memory dependence exist between it and other instructions denoted 
-with the same loop identifier.
+any loop carried memory dependence between it and other instructions denoted 
+with the same loop identifier have a minimum dependence distance of ``k``,
+where ``k`` is specified by ``llvm.mem.minimum_dependence_distance`` metadata
+on the loop.  If ``k`` is not specified by such metadata, it is positive 
+infinity.
 
 Precisely, given two instructions ``m1`` and ``m2`` that both have the 
 ``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the 
 set of loops associated with that metadata, respectively, then there is no loop 
-carried dependence between ``m1`` and ``m2`` for loops in both ``L1`` and 
-``L2``.
+carried dependence with distance less than ``k1`` and ``k2`` between 
+``m1`` and ``m2`` for loops in both ``L1`` and ``L2``, where ``k1`` and ``k2``
+are the minimum distancesd specified by ``llvm.loop.minimum_dependence_distance``
+on ``L1`` and ``L2`` respectively (or the default of infinity if not specified).  
 
 As a special case, if all memory accessing instructions in a loop have 
-``llvm.mem.parallel_loop_access`` metadata that refers to that loop, then the 
-loop has no loop carried memory dependences and is considered to be a parallel 
-loop.  
+``llvm.mem.parallel_loop_access`` metadata that refers to that loop and
+``llvm.loop.minimum_dependence_distance`` is not specified for the loop,
+then the loop has no loop carried memory dependences and is considered to 
+be a parallel loop.  
 
 Note that if not all memory access instructions have such metadata referring to 
 the loop, then the loop is considered not being trivially parallel. Additional 
Index: include/llvm/Analysis/LoopInfo.h
===================================================================
--- include/llvm/Analysis/LoopInfo.h
+++ include/llvm/Analysis/LoopInfo.h
@@ -407,7 +407,8 @@
   /// isSafeToClone - Return true if the loop body is safe to clone in practice.
   bool isSafeToClone() const;
 
-  /// Returns true if the loop is annotated parallel.
+  /// Returns true if the loop is annotated parallel *without* distance
+  /// restrictions.
   ///
   /// A parallel loop can be assumed to not contain any dependencies between
   /// iterations by the compiler. That is, any loop-carried dependency checking
@@ -419,7 +420,20 @@
   /// iterations is not guaranteed, thus, the end result might or might not
   /// implement actual concurrent execution of instructions across multiple
   /// iterations.
-  bool isAnnotatedParallel() const;
+  bool isAnnotatedParallel() const {
+    return getAnnotatedRestrictedParallel().second == 0;
+  };
+
+  /// Determine if the loop is annotated parallel, possibly with distance
+  /// restrictions.
+  ///
+  /// Let (p,k) denote the returned pair.
+  /// If p==false, no assumptions can be made.  k is always 1 in this case.
+  /// If p==true && k==0, then the loop can be assumed to have no
+  /// dependencies between iterations, i.e. isAnnotatedParallel()==true.
+  /// If p==true && k>0, the loop can be assumed to have no dependencies
+  /// with distance less than k between iterations.
+  std::pair<bool, unsigned> getAnnotatedRestrictedParallel() const;
 
   /// Return the llvm.loop loop id metadata node for this loop if it is present.
   ///
Index: lib/Analysis/LoopInfo.cpp
===================================================================
--- lib/Analysis/LoopInfo.cpp
+++ lib/Analysis/LoopInfo.cpp
@@ -285,11 +285,11 @@
   }
 }
 
-bool Loop::isAnnotatedParallel() const {
+std::pair<bool, unsigned> Loop::getAnnotatedRestrictedParallel() const {
   MDNode *desiredLoopIdMetadata = getLoopID();
 
   if (!desiredLoopIdMetadata)
-      return false;
+    return std::make_pair(false, 1u);
 
   // The loop branch contains the parallel loop metadata. In order to ensure
   // that any parallel-loop-unaware optimization pass hasn't added loop-carried
@@ -310,7 +310,7 @@
       MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access");
 
       if (!loopIdMD)
-        return false;
+        return std::make_pair(false, 1u);
 
       bool loopIdMDFound = false;
       for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
@@ -321,12 +321,26 @@
       }
 
       if (!loopIdMDFound)
-        return false;
+        return std::make_pair(false, 1);
     }
   }
-  return true;
-}
 
+  // Check for restriction to distance d.  If present, it appears as a non-first
+  // operand of the loopId with the form
+  //   'metadata !"llvm.loop.minimum_dependence_distance", i32 d'
+  for (unsigned i = 1, ie = desiredLoopIdMetadata->getNumOperands(); i < ie;
+       ++i)
+    if (const MDNode *MD =
+            dyn_cast<MDNode>(desiredLoopIdMetadata->getOperand(i)))
+      if (MD->getNumOperands() == 2)
+        if (MDString *s = dyn_cast<MDString>(MD->getOperand(0)))
+          if (s->getString() == "llvm.loop.minimum_dependence_distance")
+            return std::make_pair(
+                true, dyn_cast<ConstantInt>(MD->getOperand(1))->getZExtValue());
+
+  // Zero is convention for unrestricted.
+  return std::make_pair(true, 0);
+}
 
 /// hasDedicatedExits - Return true if no exit block for the loop
 /// has a predecessor that is outside the loop.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -581,8 +581,8 @@
                             AliasAnalysis *AA, Function *F)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
         DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
-        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
-  }
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U),
+        AnnotatedMinDepDist(1) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -759,6 +759,13 @@
 
   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  void adjustForAnnotatedMinDepDist(unsigned WidestTypeBytes) {
+    if (AnnotatedMinDepDist != 1)
+      MaxSafeDepDistBytes = AnnotatedMinDepDist < -1u / WidestTypeBytes
+                                ? AnnotatedMinDepDist * WidestTypeBytes
+                                : -1u;
+  }
+
   bool hasStride(Value *V) { return StrideSet.count(V); }
   bool mustCheckStrides() { return !StrideSet.empty(); }
   SmallPtrSet<Value *, 8>::iterator strides_begin() {
@@ -866,6 +873,7 @@
   bool HasFunNoNaNAttr;
 
   unsigned MaxSafeDepDistBytes;
+  unsigned AnnotatedMinDepDist;
 
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
@@ -986,7 +994,8 @@
   enum HintKind {
     HK_WIDTH,
     HK_UNROLL,
-    HK_FORCE
+    HK_FORCE,
+    HK_MINIMUM_DEPEENDENCE_DISTANCE
   };
 
   /// Hint - associates name and validation with the hint value.
@@ -1006,9 +1015,16 @@
         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
       case HK_FORCE:
         return (Val <= 1);
+      case HK_MINIMUM_DEPEENDENCE_DISTANCE:
+        return Val >= 1;
       }
       return false;
     }
+
+    /// True if hint should not be written out after vectorization.
+    bool squash() const {
+      return Value == 0 && Kind == HK_MINIMUM_DEPEENDENCE_DISTANCE;
+    }
   };
 
   /// Vectorization width.
@@ -1017,8 +1033,11 @@
   Hint Interleave;
   /// Vectorization forced
   Hint Force;
+  /// Minimum dependence distance. Technically, is not a hint, but it's handled
+  /// syntactically the same way as a hint.
+  Hint MinimumDependenceDistance;
   /// Array to help iterating through all hints.
-  Hint *Hints[3]; // avoiding initialisation due to MSVC2012
+  Hint *Hints[4]; // avoiding initialisation due to MSVC2012
 
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
@@ -1034,11 +1053,14 @@
       : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
+        MinimumDependenceDistance("minimum_dependence_distance", 0,
+                                  HK_MINIMUM_DEPEENDENCE_DISTANCE),
         TheLoop(L) {
     // FIXME: Move this up initialisation when MSVC requirement is 2013+
     Hints[0] = &Width;
     Hints[1] = &Interleave;
     Hints[2] = &Force;
+    Hints[3] = &MinimumDependenceDistance;
 
     // Populate values with existing loop metadata.
     getHintsFromMetadata();
@@ -1054,12 +1076,17 @@
   /// Mark the loop L as already vectorized by setting the width to 1.
   void setAlreadyVectorized() {
     Width.Value = Interleave.Value = 1;
+    // If distance was finite, conservatively assume it shrunk to 1
+    if (MinimumDependenceDistance.Value > 0)
+      MinimumDependenceDistance.Value = 1;
+
     // FIXME: Change all lines below for this when we can use MSVC 2013+
-    //writeHintsToMetadata({ Width, Unroll });
+    // writeHintsToMetadata({ Width, Unroll, MinimumDependenceDistance });
     std::vector<Hint> hints;
-    hints.reserve(2);
+    hints.reserve(3);
     hints.emplace_back(Width);
     hints.emplace_back(Interleave);
+    hints.emplace_back(MinimumDependenceDistance);
     writeHintsToMetadata(std::move(hints));
   }
 
@@ -1187,8 +1214,10 @@
 
     // Now, add the missing hints.
     for (auto H : HintTypes)
-      Vals.push_back(
-          createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+      // Elide hint with value of zero.
+      if (!H.squash())
+        Vals.push_back(
+            createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
 
     // Replace current metadata node with new one.
     LLVMContext &Context = TheLoop->getHeader()->getContext();
@@ -4710,7 +4739,12 @@
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
-  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+  const auto Annotations = TheLoop->getAnnotatedRestrictedParallel();
+  // Logic below handles fully parallel loops by treating them
+  // as having a very large dependence distance.
+  AnnotatedMinDepDist = Annotations.second ? Annotations.second : -1u;
+
+  const bool NeedDepCheck = AnnotatedMinDepDist == 1;
   MemoryDepChecker DepChecker(SE, DL, TheLoop);
 
   // For each block.
@@ -4733,7 +4767,7 @@
           continue;
 
         LoadInst *Ld = dyn_cast<LoadInst>(it);
-        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+        if (!Ld || (!Ld->isSimple() && NeedDepCheck)) {
           emitAnalysis(Report(Ld)
                        << "read with atomic ordering or volatile read");
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
@@ -4752,7 +4786,7 @@
           emitAnalysis(Report(it) << "instruction cannot be vectorized");
           return false;
         }
-        if (!St->isSimple() && !IsAnnotatedParallel) {
+        if (!St->isSimple() && NeedDepCheck) {
           emitAnalysis(Report(St)
                        << "write with atomic ordering or volatile write");
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
@@ -4814,10 +4848,10 @@
     }
   }
 
-  if (IsAnnotatedParallel) {
-    DEBUG(dbgs()
-          << "LV: A loop annotated parallel, ignore memory dependency "
-          << "checks.\n");
+  if (!NeedDepCheck) {
+    DEBUG(dbgs() << "LV: A loop annotated with min dependence distance "
+                 << AnnotatedMinDepDist
+                 << ", ignore memory dependency checks.\n");
     return true;
   }
 
@@ -5358,10 +5392,11 @@
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
   unsigned MaxSafeDepDist = -1U;
+  Legal->adjustForAnnotatedMinDepDist(WidestType / 8);
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
-  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
-                    WidestRegister : MaxSafeDepDist);
+  WidestRegister =
+      ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
   unsigned MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
   DEBUG(dbgs() << "LV: The Widest register is: "
Index: test/Transforms/LoopVectorize/X86/min-dep-dist.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/min-dep-dist.ll
+++ test/Transforms/LoopVectorize/X86/min-dep-dist.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s  -loop-vectorize -mcpu=corei7-avx -S | FileCheck %s
+; CHECK-NOT: memcheck
+; CHECK-NOT: <8 x float
+; CHECK-NOT: <4 x float
+; CHECK: <2 x float
+; CHECK-NOT: minimum_dependence_distance", i32 2}
+ 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Simple routine that would be vectorized wider than 2
+; if llvm.loop.minimum_dependence_distance were not present.
+define void @foo(float* %a, float* %b) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %i = getelementptr inbounds float* %a, i64 %indvars.iv
+  %t = load float* %i, align 4, !llvm.mem.parallel_loop_access !0
+  %mul = fmul float %t, %t
+  %j = getelementptr inbounds float* %b, i64 %indvars.iv
+  store float %mul, float* %j, align 4, !llvm.mem.parallel_loop_access !0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = metadata !{metadata !0, metadata !1}
+!1 = metadata !{metadata !"llvm.loop.minimum_dependence_distance", i32 2}