Index: llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h
+++ llvm/trunk/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"

 namespace llvm {
@@ -62,6 +63,7 @@
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = false;
+  bool HasGuards = false;
 #ifdef NDEBUG
   SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
 #else
@@ -122,6 +124,9 @@
   bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
   bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);

+  bool ProcessGuards(BasicBlock *BB);
+  bool ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI);
+
 private:
   BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
                               const char *Suffix);
Index: llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
+++ llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
@@ -245,6 +245,16 @@
 void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
                                ValueToValueMapTy &VMap);

+/// Split edge between BB and PredBB and duplicate all non-Phi instructions
+/// from BB between its beginning and the StopAt instruction into the split
+/// block. Phi nodes are not duplicated, but their uses are handled correctly:
+/// we replace them with the uses of corresponding Phi inputs. ValueMapping
+/// is used to map the original instructions from BB to their newly-created
+/// copies. Returns the split block.
+BasicBlock *
+DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                    Instruction *StopAt,
+                                    ValueToValueMapTy &ValueMapping);
 } // end namespace llvm

 #endif // LLVM_TRANSFORMS_UTILS_CLONING_H
Index: llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
+++ llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,11 +30,13 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
@@ -169,6 +171,9 @@
   // When profile data is available, we need to update edge weights after
   // successful jump threading, which requires both BPI and BFI being available.
   HasProfileData = HasProfileData_;
+  auto *GuardDecl = F.getParent()->getFunction(
+      Intrinsic::getName(Intrinsic::experimental_guard));
+  HasGuards = GuardDecl && !GuardDecl->use_empty();
   if (HasProfileData) {
     BPI = std::move(BPI_);
     BFI = std::move(BFI_);
@@ -238,10 +243,13 @@
   return EverChanged;
 }

-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+/// Return the cost of duplicating a piece of this block, from the first non-phi
+/// instruction up to StopAt, to thread across it. Stop scanning the block when
+/// exceeding the threshold. If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+                                             Instruction *StopAt,
                                              unsigned Threshold) {
+  assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");

   /// Ignore PHI nodes, these will be flattened when duplication happens.
   BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -249,15 +257,17 @@
   // branch, so they shouldn't count against the duplication cost.
   unsigned Bonus = 0;

-  const TerminatorInst *BBTerm = BB->getTerminator();
-  // Threading through a switch statement is particularly profitable. If this
-  // block ends in a switch, decrease its cost to make it more likely to happen.
-  if (isa<SwitchInst>(BBTerm))
-    Bonus = 6;
-
-  // The same holds for indirect branches, but slightly more so.
-  if (isa<IndirectBrInst>(BBTerm))
-    Bonus = 8;
+  if (BB->getTerminator() == StopAt) {
+    // Threading through a switch statement is particularly profitable. If this
+    // block ends in a switch, decrease its cost to make it more likely to
+    // happen.
+    if (isa<SwitchInst>(StopAt))
+      Bonus = 6;
+
+    // The same holds for indirect branches, but slightly more so.
+    if (isa<IndirectBrInst>(StopAt))
+      Bonus = 8;
+  }

   // Bump the threshold up so the early exit from the loop doesn't skip the
   // terminator-based Size adjustment at the end.
@@ -266,7 +276,7 @@
   // Sum up the cost of each instruction until we get to the terminator. Don't
   // include the terminator because the copy won't include it.
   unsigned Size = 0;
-  for (; !isa<TerminatorInst>(I); ++I) {
+  for (; &*I != StopAt; ++I) {

     // Stop scanning the block if we've reached the threshold.
     if (Size > Threshold)
@@ -712,6 +722,10 @@
   if (TryToUnfoldSelectInCurrBB(BB))
     return true;

+  // Look if we can propagate guards to predecessors.
+  if (HasGuards && ProcessGuards(BB))
+    return true;
+
   // What kind of constant we're looking for.
   ConstantPreference Preference = WantInteger;
@@ -1466,7 +1480,8 @@
     return false;
   }

-  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned JumpThreadCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (JumpThreadCost > BBDupThreshold) {
     DEBUG(dbgs() << " Not threading BB '" << BB->getName()
                  << "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1754,7 +1769,8 @@
     return false;
   }

-  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned DuplicationCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (DuplicationCost > BBDupThreshold) {
     DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
                  << "' - Cost is too high: " << DuplicationCost << "\n");
@@ -2019,3 +2035,124 @@

   return false;
 }
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+///   %cond = ...
+///   br i1 %cond, label %T1, label %F1
+/// T1:
+///   br label %Merge
+/// F1:
+///   br label %Merge
+/// Merge:
+///   %condGuard = ...
+///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+  using namespace PatternMatch;
+  // We only want to deal with two predecessors.
+  BasicBlock *Pred1, *Pred2;
+  auto PI = pred_begin(BB), PE = pred_end(BB);
+  if (PI == PE)
+    return false;
+  Pred1 = *PI++;
+  if (PI == PE)
+    return false;
+  Pred2 = *PI++;
+  if (PI != PE)
+    return false;
+
+  // Try to thread one of the guards of the block.
+  // TODO: Look up deeper than to immediate predecessor?
+  if (auto Parent = Pred1->getSinglePredecessor())
+    if (Parent == Pred2->getSinglePredecessor())
+      if (BranchInst *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+        for (auto &I : *BB)
+          if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+            if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+              return true;
+
+  return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the guard's
+/// condition.
bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+                                    BranchInst *BI) {
+  Value *GuardCond = Guard->getArgOperand(0);
+  Value *BranchCond = BI->getCondition();
+  BasicBlock *TrueDest = BI->getSuccessor(0);
+  BasicBlock *FalseDest = BI->getSuccessor(1);
+
+  auto &DL = BB->getModule()->getDataLayout();
+  bool TrueDestIsSafe = false;
+  bool FalseDestIsSafe = false;
+
+  // True dest is safe if BranchCond => GuardCond.
+  auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+  if (Impl && *Impl)
+    TrueDestIsSafe = true;
+  else {
+    // False dest is safe if !BranchCond => GuardCond.
+    Impl =
+        isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+    if (Impl && *Impl)
+      FalseDestIsSafe = true;
+  }
+
+  if (!TrueDestIsSafe && !FalseDestIsSafe)
+    return false;
+
+  BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+  BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+  ValueToValueMapTy UnguardedMapping, GuardedMapping;
+  Instruction *AfterGuard = Guard->getNextNode();
+  unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+  if (Cost > BBDupThreshold)
+    return false;
+  // Duplicate all instructions before the guard and the guard itself to the
+  // branch where implication is not proved.
+  GuardedBlock = DuplicateInstructionsInSplitBetween(
+      BB, GuardedBlock, AfterGuard, GuardedMapping);
+  assert(GuardedBlock && "Could not create the guarded block?");
+  // Duplicate all instructions before the guard in the unguarded branch.
+  // Since we have successfully duplicated the guarded block and this block
+  // has fewer instructions, we expect it to succeed.
+  UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+                                                       Guard, UnguardedMapping);
+  assert(UnguardedBlock && "Could not create the unguarded block?");
+  DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+               << GuardedBlock->getName() << "\n");
+
+  // Some instructions before the guard may still have uses. For them, we need
+  // to create Phi nodes merging their copies in both guarded and unguarded
+  // branches. Those instructions that have no uses can be just removed.
+  SmallVector<Instruction *, 4> ToRemove;
+  for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+    if (!isa<PHINode>(&*BI))
+      ToRemove.push_back(&*BI);
+
+  Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+  assert(InsertionPoint && "Empty block?");
+  // Substitute with Phis & remove.
+  for (auto *Inst : reverse(ToRemove)) {
+    if (!Inst->use_empty()) {
+      PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+      NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+      NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+      NewPN->insertBefore(InsertionPoint);
+      Inst->replaceAllUsesWith(NewPN);
+    }
+    Inst->eraseFromParent();
+  }
+  return true;
+}
Index: llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
+++ llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
@@ -747,3 +747,40 @@

   return NewLoop;
 }
+
+/// \brief Duplicate non-Phi instructions from the beginning of block up to
+/// StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                          Instruction *StopAt,
+                                          ValueToValueMapTy &ValueMapping) {
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  BasicBlock *NewBB = SplitEdge(PredBB, BB);
+  NewBB->setName(PredBB->getName() + ".split");
+  Instruction *NewTerm = NewBB->getTerminator();
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; StopAt != &*BI; ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    New->insertBefore(NewTerm);
+    ValueMapping[&*BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        auto I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  return NewBB;
+}
Index: llvm/trunk/test/Transforms/JumpThreading/guards.ll
===================================================================
--- llvm/trunk/test/Transforms/JumpThreading/guards.ll
+++ llvm/trunk/test/Transforms/JumpThreading/guards.ll
@@ -0,0 +1,132 @@
+; RUN: opt < %s -jump-threading -dce -S | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @f1()
+declare i32 @f2()
+
+define i32 @branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @branch_implies_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1.split
+; CHECK: %v1 = call i32 @f1()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1.split
+; CHECK: %v2 = call i32 @f2()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: %condGuard
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @not_branch_implies_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1.split:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: %condGuard
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1.split:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: %retVal
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_overlaps_guard(i32 %a) {
+; CHECK-LABEL: @branch_overlaps_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK: %condGuard = icmp slt i32 %a, 10
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_doesnt_overlap_guard(i32 %a) {
+; CHECK-LABEL: @branch_doesnt_overlap_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK: T1:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK: F1:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK: Merge
+; CHECK: %condGuard = icmp sgt i32 %a, 20
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
Index: llvm/trunk/unittests/Transforms/Utils/Cloning.cpp
===================================================================
--- llvm/trunk/unittests/Transforms/Utils/Cloning.cpp
+++ llvm/trunk/unittests/Transforms/Utils/Cloning.cpp
@@ -201,6 +201,53 @@
   delete F2;
 }

+TEST_F(CloneInstruction, DuplicateInstructionsToSplit) {
+  Type *ArgTy1[] = {Type::getInt32PtrTy(context)};
+  FunctionType *FT = FunctionType::get(Type::getVoidTy(context), ArgTy1, false);
+  V = new Argument(Type::getInt32Ty(context));
+
+  Function *F = Function::Create(FT, Function::ExternalLinkage);
+
+  BasicBlock *BB1 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder1(BB1);
+
+  BasicBlock *BB2 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder2(BB2);
+
+  Builder1.CreateBr(BB2);
+
+  Instruction *AddInst = cast<Instruction>(Builder2.CreateAdd(V, V));
+  Instruction *MulInst = cast<Instruction>(Builder2.CreateMul(AddInst, V));
+  Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
+  Builder2.CreateRetVoid();
+
+  ValueToValueMapTy Mapping;
+
+  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping);
+
+  EXPECT_TRUE(Split);
+  EXPECT_EQ(Mapping.size(), 2u);
+  EXPECT_TRUE(Mapping.find(AddInst) != Mapping.end());
+  EXPECT_TRUE(Mapping.find(MulInst) != Mapping.end());
+
+  auto AddSplit = dyn_cast<Instruction>(Mapping[AddInst]);
+  EXPECT_TRUE(AddSplit);
+  EXPECT_EQ(AddSplit->getOperand(0), V);
+  EXPECT_EQ(AddSplit->getOperand(1), V);
+  EXPECT_EQ(AddSplit->getParent(), Split);
+
+  auto MulSplit = dyn_cast<Instruction>(Mapping[MulInst]);
+  EXPECT_TRUE(MulSplit);
+  EXPECT_EQ(MulSplit->getOperand(0), AddSplit);
+  EXPECT_EQ(MulSplit->getOperand(1), V);
+  EXPECT_EQ(MulSplit->getParent(), Split);
+
+  EXPECT_EQ(AddSplit->getNextNode(), MulSplit);
+  EXPECT_EQ(MulSplit->getNextNode(), Split->getTerminator());
+
+  delete F;
+}
+
 class CloneFunc : public ::testing::Test {
 protected:
   void SetUp() override {