Index: include/llvm/Transforms/Scalar/JumpThreading.h
===================================================================
--- include/llvm/Transforms/Scalar/JumpThreading.h
+++ include/llvm/Transforms/Scalar/JumpThreading.h
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"
 
 namespace llvm {
@@ -122,6 +123,9 @@
   bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
   bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
 
+  bool ProcessGuards(BasicBlock *BB);
+  bool ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI);
+
 private:
   BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
                               const char *Suffix);
Index: include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- include/llvm/Transforms/Utils/Cloning.h
+++ include/llvm/Transforms/Utils/Cloning.h
@@ -245,6 +245,18 @@
 void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
                                ValueToValueMapTy &VMap);
 
+/// Split the edge from PredBB to BB and clone every non-PHI instruction of BB
+/// that precedes StopAt (StopAt itself is not cloned) into the new block. On
+/// return ValueMapping maps the PHI nodes of BB to their incoming values for
+/// PredBB and every cloned instruction of BB to its newly-created copy.
+/// Returns nullptr, before anything is modified, in either of these cases:
+///   a) The number of instructions to clone exceeds the cap;
+///   b) One of the instructions is a call that cannot be duplicated.
+/// Otherwise returns the split block.
+BasicBlock *DuplicateInstructionsInSplitBetween(
+    BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
+    unsigned MaxDuplicatedInstructions,
+    ValueToValueMapTy &ValueMapping);
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_CLONING_H
Index: lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- lib/Transforms/Scalar/JumpThreading.cpp
+++ lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,17 +30,20 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 #include <memory>
 using namespace llvm;
 using namespace jumpthreading;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "jump-threading"
 
@@ -712,6 +715,10 @@
   if (TryToUnfoldSelectInCurrBB(BB))
     return true;
 
+  // Look if we can propagate guards to predecessors.
+  if (ProcessGuards(BB))
+    return true;
+
   // What kind of constant we're looking for.
   ConstantPreference Preference = WantInteger;
 
@@ -2019,3 +2026,121 @@
   
   return false;
 }
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// when reaching BB through the other one implies that the guard's condition
+/// is true. Currently only the simplest case is processed, which looks like:
+///
+/// Start:
+///   %cond = ...
+///   br i1 %cond, label %T1, label %F1
+/// T1:
+///   br label %Merge
+/// F1:
+///   br label %Merge
+/// Merge:
+///   %condGuard = ...
+///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// where either cond or !cond implies condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to the branch that does not imply its condition.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+  // We only deal with two incoming edges that come from two distinct blocks
+  // (one block reaching BB over both of its edges is not a diamond branch).
+  auto PI = pred_begin(BB), PE = pred_end(BB);
+  if (PI == PE)
+    return false;
+  BasicBlock *Pred1 = *PI++;
+  if (PI == PE)
+    return false;
+  BasicBlock *Pred2 = *PI++;
+  if (PI != PE || Pred1 == Pred2)
+    return false;
+
+  // Both predecessors must hang off one parent block, the top of the diamond.
+  // TODO: Look up deeper than to immediate predecessor?
+  BasicBlock *Parent = Pred1->getSinglePredecessor();
+  if (!Parent || Parent != Pred2->getSinglePredecessor())
+    return false;
+  // Try to thread one of the guards of the block.
+  if (auto *ParentBr = dyn_cast<BranchInst>(Parent->getTerminator()))
+    for (Instruction &I : *BB)
+      if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+        if (ThreadGuard(BB, cast<IntrinsicInst>(&I), ParentBr))
+          return true;
+  return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// into one of the diamond's branches if the diamond's condition or its
+/// negation implies the guard's condition. BI is the diamond's branch.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+                                    BranchInst *BI) {
+  Value *GuardCond = Guard->getArgOperand(0);
+  Value *BranchCond = BI->getCondition();
+  BasicBlock *TrueDest = BI->getSuccessor(0);
+  BasicBlock *FalseDest = BI->getSuccessor(1);
+
+  // True dest is safe if BranchCond => GuardCond, false dest is safe if
+  // !BranchCond => GuardCond. The latter is only queried if the former fails.
+  auto &DL = BB->getModule()->getDataLayout();
+  auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+  bool TrueDestIsSafe = Impl && *Impl;
+  bool FalseDestIsSafe = false;
+  if (!TrueDestIsSafe) {
+    Impl = isImpliedCondition(BranchCond, GuardCond, DL, true);
+    FalseDestIsSafe = Impl && *Impl;
+  }
+  if (!TrueDestIsSafe && !FalseDestIsSafe)
+    return false;
+
+  BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+  BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+  Instruction *AfterGuard = Guard->getNextNode();
+
+  // The guarded copy is the larger one (it also contains the guard). Make sure
+  // it can be created before touching the CFG: giving up once the unguarded
+  // copy exists would leave BB's instructions executed twice on that path.
+  unsigned NumToClone = 0;
+  for (auto It = BB->begin(); &*It != AfterGuard; ++It) {
+    auto *CI = dyn_cast<CallInst>(&*It);
+    if (CI && CI->cannotDuplicate())
+      return false;
+    if (!isa<PHINode>(&*It) && ++NumToClone > BBDupThreshold)
+      return false;
+  }
+  ValueToValueMapTy UnguardedMapping, GuardedMapping;
+  // Duplicate all instructions before the guard in the unguarded branch.
+  UnguardedBlock =
+      DuplicateInstructionsInSplitBetween(BB, UnguardedBlock, Guard,
+                                          BBDupThreshold, UnguardedMapping);
+  if (!UnguardedBlock)
+    return false;
+
+  // Duplicate all instructions before the guard and the guard itself to the
+  // branch where implication is not proved.
+  GuardedBlock =
+      DuplicateInstructionsInSplitBetween(BB, GuardedBlock, AfterGuard,
+                                          BBDupThreshold, GuardedMapping);
+  assert(GuardedBlock && "Duplication must succeed after the check above");
+  DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+               << GuardedBlock->getName() << "\n");
+
+  // Some instructions before the guard may still have uses. For them, we need
+  // to create Phi nodes merging their copies in both guarded and unguarded
+  // branches. Those instructions that have no uses can be just removed. Walk
+  // backwards from the guard so that users are erased before their operands.
+  for (Instruction *Inst = Guard, *Prev = nullptr; Inst; Inst = Prev) {
+    Prev = Inst->getPrevNode();
+    if (!Inst->use_empty()) {
+      PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+      NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+      NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+      NewPN->insertBefore(Inst);
+      Inst->replaceAllUsesWith(NewPN);
+    }
+    Inst->eraseFromParent();
+  }
+  return true;
+}
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -747,3 +747,59 @@
 
   return NewLoop;
 }
+
+/// \brief Duplicate non-Phi instructions from the beginning of BB up to (but
+/// not including) StopAt into a new block that splits the PredBB -> BB edge.
+BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
+    BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
+    unsigned MaxDuplicatedInstructions,
+    ValueToValueMapTy &ValueMapping) {
+  assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
+
+  unsigned InstructionsToDuplicate = 0;
+
+  // First, check that we can copy all those instructions: none of them may be
+  // a call that cannot be duplicated, and the number of non-Phi instructions
+  // must not exceed the cap. Nothing is modified before this check is passed.
+  for (auto BI = BB->begin(); &*BI != StopAt; ++BI) {
+    Instruction *Inst = &*BI;
+    if (isa<PHINode>(Inst))
+      continue;
+
+    if (auto *CI = dyn_cast<CallInst>(Inst))
+      if (CI->cannotDuplicate())
+        return nullptr;
+    if (++InstructionsToDuplicate > MaxDuplicatedInstructions)
+      return nullptr;
+  }
+
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  BasicBlock *NewBB = SplitEdge(PredBB, BB);
+  NewBB->setName(PredBB->getName() + ".split");
+  Instruction *NewTerm = NewBB->getTerminator();
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; StopAt != &*BI; ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    New->insertBefore(NewTerm);
+    ValueMapping[&*BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        auto I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  return NewBB;
+}
Index: test/Transforms/JumpThreading/guards.ll
===================================================================
--- test/Transforms/JumpThreading/guards.ll
+++ test/Transforms/JumpThreading/guards.ll
@@ -0,0 +1,143 @@
+; RUN: opt < %s -jump-threading -dce -S | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @f1()
+declare i32 @f2()
+declare void @f3()
+
+define i32 @branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @branch_implies_guard(
+  %cond = icmp slt i32 %a, 10   ; a < 10 implies the guard condition a < 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:       T1.split:
+; CHECK-NEXT:    %v1 = call i32 @f1()
+; CHECK-NEXT:    %retVal1 = add i32 %v1, 10
+; CHECK-NEXT:    call void @f3()
+; CHECK-NEXT:    br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:       F1.split:
+; CHECK-NEXT:    %v2 = call i32 @f2()
+; CHECK-NEXT:    %retVal3 = add i32 %v2, 10
+; CHECK-NEXT:    call void @f3()
+; CHECK-NEXT:    %condGuard4 = icmp slt i32 %a, 20
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard4) [ "deopt"() ]
+; CHECK-NEXT:    br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge:
+; CHECK-NOT:     call void (i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  call void @f3()
+  %condGuard = icmp slt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @not_branch_implies_guard(
+  %cond = icmp slt i32 %a, 20   ; a >= 20 implies the guard condition a > 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:       T1.split:
+; CHECK-NEXT:    %v1 = call i32 @f1()
+; CHECK-NEXT:    %retVal3 = add i32 %v1, 10
+; CHECK-NEXT:    call void @f3()
+; CHECK-NEXT:    %condGuard4 = icmp sgt i32 %a, 10
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard4) [ "deopt"() ]
+; CHECK-NEXT:    br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:       F1.split:
+; CHECK-NEXT:   %v2 = call i32 @f2()
+; CHECK-NEXT:   %retVal1 = add i32 %v2, 10
+; CHECK-NEXT:   call void @f3()
+; CHECK-NEXT:   br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge:
+; CHECK-NOT:     call void (i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  call void @f3()
+  %condGuard = icmp sgt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_overlaps_guard(i32 %a) {
+; CHECK-LABEL: @branch_overlaps_guard(
+  %cond = icmp slt i32 %a, 20   ; neither a < 20 nor a >= 20 implies a < 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:        T1:
+; CHECK-NEXT:      %v1 = call i32 @f1()
+; CHECK-NEXT:      br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:        F1:
+; CHECK-NEXT:     %v2 = call i32 @f2()
+; CHECK-NEXT:     br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK:         call void @f3()
+; CHECK-NEXT:    %condGuard = icmp slt i32 %a, 10
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  call void @f3()
+  %condGuard = icmp slt i32 %a, 10   ; the guard below is expected to stay in Merge
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_doesnt_overlap_guard(i32 %a) {
+; CHECK-LABEL: @branch_doesnt_overlap_guard(
+  %cond = icmp slt i32 %a, 10   ; neither a < 10 nor a >= 10 implies a > 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:        T1:
+; CHECK-NEXT:      %v1 = call i32 @f1()
+; CHECK-NEXT:      br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:        F1:
+; CHECK-NEXT:     %v2 = call i32 @f2()
+; CHECK-NEXT:     br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK:         call void @f3()
+; CHECK-NEXT:    %condGuard = icmp sgt i32 %a, 20
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  call void @f3()
+  %condGuard = icmp sgt i32 %a, 20   ; the guard below is expected to stay in Merge
+  call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+  ret i32 %retVal
+}