diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1282,6 +1282,17 @@
   }
 };
 
+/// This represents the llvm.type.test intrinsic.
+class TypeTestInst : public IntrinsicInst {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::type_test;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_IR_INTRINSICINST_H
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -532,7 +532,17 @@
     // We must ignore debug info directives when counting (otherwise they
     // would affect codegen).
     Instruction *Inst = &*--ScanFrom;
-    if (Inst->isDebugOrPseudoInst())
+    // Ignore unrelated instruction types that in particular may only be
+    // inserted under special compile modes. The scan limit is set quite small
+    // (6 instructions) by default, so these can have an outsized effect on
+    // this optimization. Additionally, if these instructions are only present
+    // when feeding back an instrumentation PGO profile, the optimization
+    // differences can affect profile matching. Specifically, it is not
+    // uncommon to apply -g or whole program devirtualization (which results
+    // in bitcast / llvm.type.test / llvm.assume sequences) only when feeding
+    // back profiles and not for the instrumentation build.
+    if (Inst->isDebugOrPseudoInst() || isa<BitCastInst>(Inst) ||
+        isa<TypeTestInst>(Inst) || isa<AssumeInst>(Inst))
       continue;
 
     // Restore ScanFrom to expected value in case next test succeeds
@@ -620,7 +630,17 @@
   SmallVector<Instruction *> MustNotAliasInsts;
   for (Instruction &Inst : make_range(++Load->getReverseIterator(),
                                       ScanBB->rend())) {
-    if (Inst.isDebugOrPseudoInst())
+    // Ignore unrelated instruction types that in particular may only be
+    // inserted under special compile modes. The scan limit is set quite small
+    // (6 instructions) by default, so these can have an outsized effect on
+    // this optimization. Additionally, if these instructions are only present
+    // when feeding back an instrumentation PGO profile, the optimization
+    // differences can affect profile matching. Specifically, it is not
+    // uncommon to apply -g or whole program devirtualization (which results
+    // in bitcast / llvm.type.test / llvm.assume sequences) only when feeding
+    // back profiles and not for the instrumentation build.
+    if (Inst.isDebugOrPseudoInst() || isa<BitCastInst>(&Inst) ||
+        isa<TypeTestInst>(&Inst) || isa<AssumeInst>(&Inst))
       continue;
 
     if (MaxInstsToScan-- == 0)
diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll
--- a/llvm/test/Transforms/InstCombine/load.ll
+++ b/llvm/test/Transforms/InstCombine/load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -instcombine -S < %s | FileCheck %s
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; Set available-load-scan-limit to the current default of 6 so that the test
+; works the same if the limit is adjusted in the future.
+; RUN: opt -instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s
+; RUN: opt -passes=instcombine -available-load-scan-limit=6 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
@@ -100,6 +102,37 @@
   ret i32 %X
 }
 
+declare void @llvm.assume(i1)
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
+
+; Ensure that store-to-load forwarding is not prevented by the bitcast /
+; type test / assume sequences inserted for whole program devirtualization.
+define i32 @test8_type_test_assume(i32* %P, [3 x i8*]* %vtable) {
+; CHECK-LABEL: @test8_type_test_assume(
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    ret i32 1
+;
+  store i32 1, i32* %P
+
+  ; Insert 2 bitcast / type test / assume sequences so that we would be above
+  ; the scan limit of 6 if they were not ignored.
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  tail call void @llvm.assume(i1 %p)
+  %vtablei82 = bitcast [3 x i8*]* %vtable to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo")
+  tail call void @llvm.assume(i1 %p2)
+
+  %X = load i32, i32* %P ; <i32> [#uses=1]
+  ret i32 %X
+}
+
 define i32 @test9(i32* %P) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll
--- a/llvm/test/Transforms/JumpThreading/thread-loads.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -jump-threading -S | FileCheck %s
-; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s
+; Set available-load-scan-limit to the current default of 6 so that the test
+; works the same if the limit is adjusted in the future.
+; RUN: opt < %s -jump-threading -available-load-scan-limit=6 -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -available-load-scan-limit=6 -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
@@ -682,6 +684,70 @@
   ret i32 10
 }
 
+declare void @llvm.assume(i1)
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
+
+; Test that we can thread through the block with the partially redundant load
+; (%2), ensuring that the threading is not prevented by the bitcast /
+; type test / assume sequences inserted for whole program devirtualization.
+define i32 @test1_type_test_assume(i32* %P, [3 x i8*]* %vtable) nounwind {
+; CHECK-LABEL: @test1_type_test_assume(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 (...) @f1() #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB1:%.*]], label [[BB1_THREAD:%.*]]
+; CHECK:       bb1.thread:
+; CHECK-NEXT:    store i32 42, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    bitcast [3 x i8*]* %vtable to i8*
+; CHECK-NEXT:    call i1 @llvm.type.test
+; CHECK-NEXT:    call void @llvm.assume
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[DOTPR:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[DOTPR]], 36
+; CHECK-NEXT:    br i1 [[TMP2]], label [[BB3]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 (...) @f2() #[[ATTR0]]
+; CHECK-NEXT:    ret i32 0
+; CHECK:       bb3:
+; CHECK-NEXT:    [[RES_02:%.*]] = phi i32 [ 1, [[BB1_THREAD]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    ret i32 [[RES_02]]
+;
+entry:
+  %0 = tail call i32 (...) @f1() nounwind ; <i32> [#uses=1]
+  %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+  br i1 %1, label %bb1, label %bb
+
+bb: ; preds = %entry
+  store i32 42, i32* %P, align 4
+
+  ; Insert 2 bitcast / type test / assume sequences so that we would be above
+  ; the scan limit of 6 if they were not ignored.
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  tail call void @llvm.assume(i1 %p)
+  %vtablei82 = bitcast [3 x i8*]* %vtable to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo")
+  tail call void @llvm.assume(i1 %p2)
+
+  br label %bb1
+
+bb1: ; preds = %entry, %bb
+  %res.0 = phi i32 [ 1, %bb ], [ 0, %entry ] ; <i32> [#uses=2]
+  %2 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
+  %3 = icmp sgt i32 %2, 36 ; <i1> [#uses=1]
+  br i1 %3, label %bb3, label %bb2
+
+bb2: ; preds = %bb1
+  %4 = tail call i32 (...) @f2() nounwind ; <i32> [#uses=0]
+  ret i32 %res.0
+
+bb3: ; preds = %bb1
+  ret i32 %res.0
+}
 
 ; CHECK: [[RNG4]] = !{i32 0, i32 1}
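
For context, a minimal usage sketch of the new TypeTestInst wrapper added in IntrinsicInst.h above. This is not part of the patch; the helper function and its name (countTypeTests) are hypothetical. It relies only on the classof() overloads the class defines, so isa<> / dyn_cast<> dispatch works the same way it does for other IntrinsicInst subclasses such as AssumeInst:

// Hypothetical example (not part of the patch): count the llvm.type.test
// calls in a function via the new TypeTestInst wrapper.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static unsigned countTypeTests(Function &F) {
  unsigned NumTypeTests = 0;
  for (Instruction &I : instructions(F))
    if (isa<TypeTestInst>(&I)) // true only for calls to llvm.type.test
      ++NumTypeTests;
  return NumTypeTests;
}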