Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,7 +15,9 @@
 //
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicInst.h"
 
 using namespace llvm;
@@ -913,6 +915,49 @@
   return true;
 }
 
+/// Return true if we can prove that the given load is safe to evaluate
+/// anywhere within the loop body (i.e. does not require predication beyond
+/// that required by the header itself).
+static bool isSafeToLoadUnconditionallyInLoop(LoadInst *LI, Loop *L,
+                                              const DataLayout &DL,
+                                              ScalarEvolution &SE,
+                                              DominatorTree &DT) {
+  Value *Ptr = LI->getPointerOperand();
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
+    return false;
+  auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
+  if (!Step)
+    return false;
+  APInt StepC = Step->getAPInt();
+  APInt AccessSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                   DL.getTypeStoreSize(LI->getType()));
+  // TODO: generalize to access patterns which don't touch every byte
+  if (StepC != AccessSize)
+    return false;
+
+  auto TC = SE.getSmallConstantTripCount(L);
+  if (!TC)
+    return false;
+
+  auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart());
+  if (!StartS || !SE.isLoopInvariant(StartS, L))
+    return false;
+
+  Value *Base = StartS->getValue();
+
+  // We have to adjust the size down by one iteration as Loads.cpp adds one
+  // back.  TODO: cleanup
+  APInt Size(DL.getIndexTypeSizeInBits(Ptr->getType()),
+             (TC - 1) * DL.getTypeStoreSize(LI->getType()));
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
+  return isSafeToLoadUnconditionally(Base,
+                                     LI->getAlignment(),
+                                     Size,
+                                     DL, HeaderFirstNonPHI,
+                                     &DT);
+}
+
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
     reportVectorizationFailure("If-conversion is disabled",
@@ -924,17 +969,55 @@
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
 
-  // A list of pointers that we can safely read and write to.
+  // A list of pointers which are known to be dereferenceable within scope of
+  // the loop body for each iteration of the loop. That is, the memory pointed
+  // to can be dereferenced (with the access size implied by the value's type)
+  // unconditionally within the loop header without introducing a new
+  // fault.
   SmallPtrSet<Value *, 8> SafePointes;
 
   // Collect safe addresses.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (blockNeedsPredication(BB))
+    if (blockNeedsPredication(BB)) {
+      // For a block which requires predication, an address may be safe to
+      // access in the loop w/o predication if we can prove dereferenceability
+      // facts sufficient to ensure it'll never fault within the loop.
+      // TODO: extend the load reasoning to stores, etc. (requires some API
+      // cleanup and renaming in Loads.h)
+      for (Instruction &I : *BB)
+        if (auto *Ptr = getLoadStorePointerOperand(&I)) {
+          // The choice of header vs preheader is deliberate here to allow loop
+          // varying addresses which can still be proven.
+          Instruction *HeaderFirstNonPHI =
+              TheLoop->getHeader()->getFirstNonPHI();
+          if (isSafeToSpeculativelyExecute(&I, HeaderFirstNonPHI, DT)) {
+            SafePointes.insert(Ptr);
+            continue;
+          }
+          // If the generic version didn't work, try to use SCEV to prove
+          // bounds on the access and discharge them against the allocation.
+          LoadInst *LI = dyn_cast<LoadInst>(&I);
+          if (!LI)
+            continue;
+          if (!LI->isUnordered() ||
+              // Speculative load may create a race that did not exist in the source.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
+              // Speculative load may load data from dirty regions.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
+            continue;
+          auto &DL = LI->getModule()->getDataLayout();
+          if (isSafeToLoadUnconditionallyInLoop(LI, TheLoop, DL,
+                                                *PSE.getSE(), *DT))
+            SafePointes.insert(Ptr);
+        }
       continue;
-
+    }
+
     for (Instruction &I : *BB)
       if (auto *Ptr = getLoadStorePointerOperand(&I))
         SafePointes.insert(Ptr);
+    continue;
   }
 
   // Collect the blocks that need predication.
Index: test/Transforms/LoopVectorize/load-deref-pred.ll
===================================================================
--- test/Transforms/LoopVectorize/load-deref-pred.ll
+++ test/Transforms/LoopVectorize/load-deref-pred.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -force-vector-width=4 -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @init(i32*)
+
+;; We don't need a masked.load for this due to deref facts, and can instead
+;; use a plain vector load
+define i32 @test_explicit_pred(i64 %len) {
+; CHECK-LABEL: @test_explicit_pred(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
+; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[LEN:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> undef, i64 [[LEN]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i1> [[TMP9]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP18]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP19]], [[TMP18]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF8:%.*]] = shufflevector <4 x i32> [[BIN_RDX7]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX9:%.*]] = add <4 x i32> [[BIN_RDX7]], [[RDX_SHUF8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[BIN_RDX9]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]]
+; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop !2
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [4096 x i32]
+  %base = bitcast [4096 x i32]* %alloca to i32*
+  call void @init(i32* %base)
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i64 %iv, 1
+  %earlycnd = icmp slt i64 %iv, %len
+  br i1 %earlycnd, label %pred, label %latch
+pred:
+  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
+  %val = load i32, i32* %addr
+  br label %latch
+latch:
+  %val.phi = phi i32 [0, %loop], [%val, %pred]
+  %accum.next = add i32 %accum, %val.phi
+  %exit = icmp ugt i64 %iv, 4094
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i32 %accum.next
+}
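
Note (not part of the patch): the codegen difference the test above checks for can be summarized with a small hand-written IR sketch. Without the dereferenceability reasoning added here, the predicated load in the loop body has to be emitted as an llvm.masked.load in the vectorized loop; once the whole [4096 x i32] allocation is known to be safe to access for the full trip count, a plain wide load plus a select on the original predicate is legal, which is what the WIDE_LOAD/PREDPHI lines above expect. The function and value names below are illustrative only, not taken from any actual vectorizer output.

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

; Predicated form: lanes where %mask is false must not be accessed at all,
; so a masked load is required.
define <4 x i32> @predicated_form(<4 x i32>* %vec.ptr, <4 x i1> %mask) {
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %vec.ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %r = select <4 x i1> %mask, <4 x i32> %v, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}

; Unconditional form: legal once every accessed byte is known dereferenceable;
; the predicate only selects between the loaded value and zero.
define <4 x i32> @unconditional_form(<4 x i32>* %vec.ptr, <4 x i1> %mask) {
  %v = load <4 x i32>, <4 x i32>* %vec.ptr, align 4
  %r = select <4 x i1> %mask, <4 x i32> %v, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}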