diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1282,6 +1282,17 @@ } }; +/// This represents the llvm.type.test intrinsic. +class TypeTestInst : public IntrinsicInst { +public: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::type_test; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + } // end namespace llvm #endif // LLVM_IR_INTRINSICINST_H diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -416,17 +416,28 @@ return isSafeToLoadUnconditionally(V, Alignment, Size, DL, ScanFrom, DT, TLI); } -/// DefMaxInstsToScan - the default number of maximum instructions +/// DefMaxInstsToScan - the default number of maximum ld/st instructions /// to scan in the block, used by FindAvailableLoadedValue(). /// FindAvailableLoadedValue() was introduced in r60148, to improve jump /// threading in part by eliminating partially redundant loads. /// At that point, the value of MaxInstsToScan was already set to '6' /// without documented explanation. -cl::opt<unsigned> -llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, -cl::desc("Use this to specify the default maximum number of instructions " -"to scan backward from a given instruction, when searching for " -"available loaded value")); +cl::opt<unsigned> llvm::DefMaxInstsToScan( + "available-load-scan-limit", cl::init(6), cl::Hidden, + cl::desc( + "Use this to specify the default maximum number of ld/st instructions " + "to scan backward from a given instruction, when searching for " + "available loaded value")); + +// The default maximum number of non ld/st instructions to skip when finding +// available loads. 
+static cl::opt<unsigned>
 +    DefMaxInstsToSkip("available-load-skip-limit", cl::init(32), cl::Hidden,
 +                      cl::desc("Use this to specify the default maximum number "
 +                               "of non-ld/st instructions "
 +                               "to skip when scanning backward from a given "
 +                               "instruction, when searching for "
 +                               "available loaded value")); Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, @@ -524,6 +535,11 @@ AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; + // If the max insts to scan is larger, presumably the caller wanted to give a + // larger budget (potentially unlimited), so bump up the insts to skip + // accordingly. + int MaxNonLdSt = + DefMaxInstsToSkip > MaxInstsToScan ? DefMaxInstsToSkip : MaxInstsToScan; const DataLayout &DL = ScanBB->getModule()->getDataLayout(); const Value *StrippedPtr = Loc.Ptr->stripPointerCasts(); @@ -532,8 +548,13 @@ // We must ignore debug info directives when counting (otherwise they // would affect codegen). Instruction *Inst = &*--ScanFrom; - if (Inst->isDebugOrPseudoInst()) + if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst) && + !Inst->mayWriteToMemory()) { + if (MaxNonLdSt-- == 0) { + return nullptr; + } continue; + } // Restore ScanFrom to expected value in case next test succeeds ScanFrom++; @@ -618,10 +639,20 @@ // queries until later. Value *Available = nullptr; SmallVector<Instruction *> MustNotAliasInsts; + // If the max insts to scan is larger, presumably the caller wanted to give a + // larger budget (potentially unlimited), so bump up the insts to skip + // accordingly. + int MaxNonLdSt = + DefMaxInstsToSkip > MaxInstsToScan ? 
DefMaxInstsToSkip : MaxInstsToScan; for (Instruction &Inst : make_range(++Load->getReverseIterator(), ScanBB->rend())) { - if (Inst.isDebugOrPseudoInst()) + if (!isa<LoadInst>(&Inst) && !isa<StoreInst>(&Inst) && + !Inst.mayWriteToMemory()) { + if (MaxNonLdSt-- == 0) { + return nullptr; + } continue; + } if (MaxInstsToScan-- == 0) return nullptr; diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1404,8 +1404,11 @@ // If PredBB has a single predecessor, continue scanning through the // single predecessor. BasicBlock *SinglePredBB = PredBB; + // Make sure we don't walk backwards into an unreachable loop. + SmallPtrSet<BasicBlock *, 8> Visited; while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() && - NumScanedInst < DefMaxInstsToScan) { + NumScanedInst < DefMaxInstsToScan && + Visited.insert(SinglePredBB).second) { SinglePredBB = SinglePredBB->getSinglePredecessor(); if (SinglePredBB) { BBIt = SinglePredBB->end(); diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll --- a/llvm/test/Transforms/InstCombine/load.ll +++ b/llvm/test/Transforms/InstCombine/load.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S < %s | FileCheck %s -; RUN: opt -passes=instcombine -S < %s | FileCheck %s +; Set available-load-scan-limit to the current default of 6 so that the test +; works the same if the limit is adjusted in the future. 
+; RUN: opt -instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s +; RUN: opt -passes=instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1" @@ -100,6 +102,37 @@ ret i32 %X } +declare void @llvm.assume(i1) +declare i1 @llvm.type.test(i8*, metadata) nounwind readnone + +; Ensure that load store forwarding not prevented by the bitcast / type test / +; assume sequences inserted for whole program devirtualization. +define i32 @test8_type_test_assume(i32* %P, [3 x i8*]* %vtable) { +; CHECK-LABEL: @test8_type_test_assume( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]], align 4 +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: ret i32 1 +; + store i32 1, i32* %P + + ; Insert 2 bitcast / type test / assume sequences so that we would be above + ; the scan limit of 6 if they were not ignored. 
+ %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo") + tail call void @llvm.assume(i1 %p) + %vtablei82 = bitcast [3 x i8*]* %vtable to i8* + %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo") + tail call void @llvm.assume(i1 %p2) + + %X = load i32, i32* %P ; [#uses=1] + ret i32 %X +} + define i32 @test9(i32* %P) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll --- a/llvm/test/Transforms/JumpThreading/thread-loads.ll +++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -jump-threading -S | FileCheck %s -; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s +; Set available-load-scan-limit to the current default of 6 so that the test +; works the same if the limit is adjusted in the future. +; RUN: opt < %s -jump-threading -available-load-scan-limit=6 -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -S | FileCheck %s +; Try again with a low skip limit. +; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -available-load-skip-limit=6 -S | FileCheck %s --check-prefix=SKIPLIM6 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7" @@ -682,6 +686,127 @@ ret i32 10 } +declare void @llvm.assume(i1) +declare i1 @llvm.type.test(i8*, metadata) nounwind readnone + +; Test that we can thread through the block with the partially redundant load (%2), +; ensuring that the threading is not prevented by the bitcast / type test / +; assume sequences inserted for whole program devirtualization. 
+define i32 @test1_type_test_assume(i32* %P, [3 x i8*]* %vtable) nounwind { +; CHECK-LABEL: @test1_type_test_assume( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 (...) @f1() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]] +; CHECK: bb1.thread: +; CHECK-NEXT: store i32 42, i32* [[P:%.*]], align 4 +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: bitcast [3 x i8*]* %vtable to i8* +; CHECK-NEXT: call i1 @llvm.type.test +; CHECK-NEXT: call void @llvm.assume +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, i32* [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36 +; CHECK-NEXT: br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]] +; CHECK-NEXT: ret i32 0 +; CHECK: bb3: +; CHECK-NEXT: [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: ret i32 [[RES_02]] +; +; We should not thread if we've set the scan limit low. +; SKIPLIM6-LABEL: @test1_type_test_assume +; SKIPLIM6-NOT: thread +; SKIPLIM6: ret +entry: + %0 = tail call i32 (...) @f1() nounwind ; [#uses=1] + %1 = icmp eq i32 %0, 0 ; [#uses=1] + br i1 %1, label %bb1, label %bb + +bb: ; preds = %entry + store i32 42, i32* %P, align 4 + + ; Insert 3 bitcast / type test / assume sequences so that we would be above + ; a skip limit of 6. 
+ %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo") + tail call void @llvm.assume(i1 %p) + %vtablei82 = bitcast [3 x i8*]* %vtable to i8* + %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo") + tail call void @llvm.assume(i1 %p2) + %vtablei83 = bitcast [3 x i8*]* %vtable to i8* + %p3 = call i1 @llvm.type.test(i8* %vtablei83, metadata !"foo") + tail call void @llvm.assume(i1 %p3) + + br label %bb1 + +bb1: ; preds = %entry, %bb + %res.0 = phi i32 [ 1, %bb ], [ 0, %entry ] ; [#uses=2] + %2 = load i32, i32* %P, align 4 ; [#uses=1] + %3 = icmp sgt i32 %2, 36 ; [#uses=1] + br i1 %3, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + %4 = tail call i32 (...) @f2() nounwind ; [#uses=0] + ret i32 %res.0 + +bb3: ; preds = %bb1 + ret i32 %res.0 +} + +; Make sure we don't infinite loop in the presence of an unreachable loop +; when searching up through single preds for available pointer loads. +define i32 @fn_SinglePredUnreachable(i1 %c2,i64* %P) { +; CHECK-LABEL: @fn_SinglePredUnreachable( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L1:%.*]] = load i64, i64* [[P:%.*]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L1]], 0 +; CHECK-NEXT: br label [[COND2:%.*]] +; CHECK: cond1: +; CHECK-NEXT: br i1 false, label [[COND1:%.*]], label %[[CONDPRESPLIT:.*]] +; CHECK: [[CONDPRESPLIT]]: +; CHECK-NEXT: [[L2PR:%.*]] = load i64, i64* [[P]], align 4 +; CHECK-NEXT: br label [[COND2:%.*]] +; CHECK: cond2: +; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L2PR]], %[[CONDPRESPLIT]] ] +; CHECK-NEXT: call void @fn2(i64 [[L2]]) +; CHECK-NEXT: [[C3:%.*]] = icmp eq i64 [[L2]], 0 +; CHECK-NEXT: br i1 %c3, label [[COND3:%.*]], label [[END:%.*]] +; CHECK: cond3: +; CHECK-NEXT: call void @fn3(i64 [[L2]]) +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret i32 0 +; + +entry: + %l1 = load i64, i64* %P + %c = icmp eq i64 %l1, 0 + br label %cond2 + +cond1: + br i1 false, label %cond1, label %cond2 + +cond2: + %l2 = load i64, i64* %P + 
call void @fn2(i64 %l2) + %c3 = icmp eq i64 %l2, 0 + br i1 %c3, label %cond3, label %end + +cond3: + call void @fn3(i64 %l2) + br label %end + +end: + ret i32 0 +} ; CHECK: [[RNG4]] = !{i32 0, i32 1}