Index: llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h @@ -477,6 +477,11 @@ /// probabilities may need to be normalized. void copySuccessor(MachineBasicBlock *Orig, succ_iterator I); + /// Split the old successor into old plus new and updates the probability + /// info. + void splitSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New, + bool NormalizeSuccProbs = false); + /// Transfers all the successors from MBB to this machine basic block (i.e., /// copies all the successors FromMBB and remove all the successors from /// FromMBB). Index: llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp +++ llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp @@ -659,6 +659,25 @@ Succ->addPredecessor(this); } +void MachineBasicBlock::splitSuccessor(MachineBasicBlock *Old, + MachineBasicBlock *New, + bool NormalizeSuccProbs) { + succ_iterator OldI = llvm::find(successors(), Old); + assert(OldI != succ_end() && "Old is not a successor of this block!"); + assert(llvm::find(successors(), New) == succ_end() && + "New is already a successor of this block!"); + + // Add a new successor with equal probability as the original one. Note + // that we directly copy the probability using the iterator rather than + // getting a potentially synthetic probability computed when unknown. This + // preserves the probabilities as-is and then we can renormalize them and + // query them effectively afterward. + addSuccessor(New, Probs.empty() ? BranchProbability::getUnknown() + : *getProbabilityIterator(OldI)); + if (NormalizeSuccProbs) + normalizeSuccProbs(); +} + void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs) { succ_iterator I = find(Successors, Succ); Index: llvm/trunk/lib/Target/X86/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/X86/CMakeLists.txt +++ llvm/trunk/lib/Target/X86/CMakeLists.txt @@ -57,6 +57,7 @@ X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp + X86SpeculativeLoadHardening.cpp X86Subtarget.cpp X86TargetMachine.cpp X86TargetObjectFile.cpp Index: llvm/trunk/lib/Target/X86/X86.h =================================================================== --- llvm/trunk/lib/Target/X86/X86.h +++ llvm/trunk/lib/Target/X86/X86.h @@ -127,6 +127,8 @@ void initializeEvexToVexInstPassPass(PassRegistry &); +FunctionPass *createX86SpeculativeLoadHardeningPass(); + } // End llvm namespace #endif Index: llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -0,0 +1,1667 @@ +//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
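A minimal usage sketch for the new splitSuccessor() hook (not part of the patch; MF is an assumed MachineFunction, MBB an assumed block, and Old one of its existing successors). It mirrors what splitEdge() in the new pass does further below:

  MachineBasicBlock *New = MF.CreateMachineBasicBlock();
  MF.insert(std::next(MachineFunction::iterator(&MBB)), New);
  // New inherits Old's edge probability verbatim; pass true to renormalize afterwards.
  MBB.splitSuccessor(&Old, New, /*NormalizeSuccProbs=*/true);
  New->addSuccessor(&Old);

Because the probability is copied from the old edge rather than synthesized, a caller that splits many edges can skip per-call normalization and renormalize each block once at the end, which is what the hardening pass below does.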
+// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Provide a pass which mitigates speculative execution attacks which operate +/// by speculating incorrectly past some predicate (a type check, bounds check, +/// or other condition) to reach a load with invalid inputs and leak the data +/// accessed by that load using a side channel out of the speculative domain. +/// +/// For details on the attacks, see the first variant in both the Project Zero +/// writeup and the Spectre paper: +/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html +/// https://spectreattack.com/spectre.pdf +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +using namespace llvm; + +#define PASS_KEY "x86-speculative-load-hardening" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced"); +STATISTIC(NumBranchesUntraced, "Number of branches unable to trace"); +STATISTIC(NumAddrRegsHardened, + "Number of address mode used registers hardaned"); +STATISTIC(NumPostLoadRegsHardened, + "Number of post-load register values hardened"); +STATISTIC(NumInstsInserted, "Number of instructions inserted"); +STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted"); + +static cl::opt HardenEdgesWithLFENCE( + PASS_KEY "-lfence", + cl::desc( + "Use LFENCE along each conditional edge to harden against speculative " + "loads rather than conditional movs and poisoned pointers."), + cl::init(false), cl::Hidden); + +static cl::opt EnablePostLoadHardening( + PASS_KEY "-post-load", + cl::desc("Harden the value loaded *after* it is loaded by " + "flushing the loaded bits to 1. 
This is hard to do " + "in general but can be done easily for GPRs."), + cl::init(true), cl::Hidden); + +static cl::opt FenceCallAndRet( + PASS_KEY "-fence-call-and-ret", + cl::desc("Use a full speculation fence to harden both call and ret edges " + "rather than a lighter weight mitigation."), + cl::init(false), cl::Hidden); + +static cl::opt HardenInterprocedurally( + PASS_KEY "-ip", + cl::desc("Harden interprocedurally by passing our state in and out of " + "functions in the high bits of the stack pointer."), + cl::init(true), cl::Hidden); + +static cl::opt + HardenLoads(PASS_KEY "-loads", + cl::desc("Sanitize loads from memory. When disable, no " + "significant security is provided."), + cl::init(true), cl::Hidden); + +namespace llvm { + +void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); + +} // end namespace llvm + +namespace { + +class X86SpeculativeLoadHardeningPass : public MachineFunctionPass { +public: + X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { + initializeX86SpeculativeLoadHardeningPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "X86 speculative load hardening"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pass identification, replacement for typeid. + static char ID; + +private: + /// The information about a block's conditional terminators needed to trace + /// our predicate state through the exiting edges. + struct BlockCondInfo { + MachineBasicBlock *MBB; + + // We mostly have one conditional branch, and in extremely rare cases have + // two. Three and more are so rare as to be unimportant for compile time. + SmallVector CondBrs; + + MachineInstr *UncondBr; + }; + + const X86Subtarget *Subtarget; + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + const TargetRegisterClass *PredStateRC; + + void hardenEdgesWithLFENCE(MachineFunction &MF); + + SmallVector collectBlockCondInfo(MachineFunction &MF); + + void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA); + + unsigned saveEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc); + void restoreEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned OFReg); + + void mergePredStateIntoSP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned PredStateReg); + unsigned extractPredStateFromSP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + DebugLoc Loc); + + void + hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO, + MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA, + SmallDenseMap &AddrRegToHardenedReg); + MachineInstr * + sinkPostLoadHardenedInst(MachineInstr &MI, + SmallPtrSetImpl &HardenedLoads); + void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); + void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); + void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); +}; + +} // end anonymous namespace + +char X86SpeculativeLoadHardeningPass::ID = 0; + +void X86SpeculativeLoadHardeningPass::getAnalysisUsage( + AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); +} + +static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB, + MachineBasicBlock &Succ, int SuccCount, + MachineInstr *Br, MachineInstr *&UncondBr, + const X86InstrInfo &TII) { + assert(!Succ.isEHPad() && "Shouldn't get edges to EH 
pads!"); + + MachineFunction &MF = *MBB.getParent(); + + MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); + + // We have to insert the new block immediately after the current one as we + // don't know what layout-successor relationships the successor has and we + // may not be able to (and generally don't want to) try to fix those up. + MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); + + // Update the branch instruction if necessary. + if (Br) { + assert(Br->getOperand(0).getMBB() == &Succ && + "Didn't start with the right target!"); + Br->getOperand(0).setMBB(&NewMBB); + + // If this successor was reached through a branch rather than fallthrough, + // we might have *broken* fallthrough and so need to inject a new + // unconditional branch. + if (!UncondBr) { + MachineBasicBlock &OldLayoutSucc = + *std::next(MachineFunction::iterator(&NewMBB)); + assert(MBB.isSuccessor(&OldLayoutSucc) && + "Without an unconditional branch, the old layout successor should " + "be an actual successor!"); + auto BrBuilder = + BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc); + // Update the unconditional branch now that we've added one. + UncondBr = &*BrBuilder; + } + + // Insert unconditional "jump Succ" instruction in the new block if + // necessary. + if (!NewMBB.isLayoutSuccessor(&Succ)) { + SmallVector Cond; + TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc()); + } + } else { + assert(!UncondBr && + "Cannot have a branchless successor and an unconditional branch!"); + assert(NewMBB.isLayoutSuccessor(&Succ) && + "A non-branch successor must have been a layout successor before " + "and now is a layout successor of the new block."); + } + + // If this is the only edge to the successor, we can just replace it in the + // CFG. Otherwise we need to add a new entry in the CFG for the new + // successor. + if (SuccCount == 1) { + MBB.replaceSuccessor(&Succ, &NewMBB); + } else { + MBB.splitSuccessor(&Succ, &NewMBB); + } + + // Hook up the edge from the new basic block to the old successor in the CFG. + NewMBB.addSuccessor(&Succ); + + // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB. + for (MachineInstr &MI : Succ) { + if (!MI.isPHI()) + break; + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) { + MachineOperand &OpV = MI.getOperand(OpIdx); + MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); + assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); + if (OpMBB.getMBB() != &MBB) + continue; + + // If this is the last edge to the succesor, just replace MBB in the PHI + if (SuccCount == 1) { + OpMBB.setMBB(&NewMBB); + break; + } + + // Otherwise, append a new pair of operands for the new incoming edge. + MI.addOperand(MF, OpV); + MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); + break; + } + } + + // Inherit live-ins from the successor + for (auto &LI : Succ.liveins()) + NewMBB.addLiveIn(LI); + + LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '" + << Succ.getName() << "'.\n"); + return NewMBB; +} + +bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( + MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << " **********\n"); + + Subtarget = &MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); + // FIXME: Support for 32-bit. + PredStateRC = &X86::GR64_NOSPRegClass; + + if (MF.begin() == MF.end()) + // Nothing to do for a degenerate empty function... 
+ return false; + + // We support an alternative hardening technique based on a debug flag. + if (HardenEdgesWithLFENCE) { + hardenEdgesWithLFENCE(MF); + return true; + } + + // Create a dummy debug loc to use for all the generated code here. + DebugLoc Loc; + + MachineBasicBlock &Entry = *MF.begin(); + auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin()); + + // Do a quick scan to see if we have any checkable loads. + bool HasCheckableLoad = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // Stop searching blocks at an LFENCE. + if (MI.getOpcode() == X86::LFENCE) + break; + + // Looking for loads only. + if (!MI.mayLoad()) + continue; + + // An MFENCE is modeled as a load but doesn't require hardening. + if (MI.getOpcode() == X86::MFENCE) + continue; + + HasCheckableLoad = true; + break; + } + if (HasCheckableLoad) + break; + } + + // See if we have any conditional branching blocks that we will need to trace + // predicate state through. + SmallVector Infos = collectBlockCondInfo(MF); + + // If we have no interesting conditions or loads, nothing to do here. + if (!HasCheckableLoad && Infos.empty()) + return true; + + unsigned PredStateReg; + unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8; + + // The poison value is required to be an all-ones value for many aspects of + // this mitigation. + const int PoisonVal = -1; + unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC); + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg) + .addImm(PoisonVal); + ++NumInstsInserted; + + // If we have loads being hardened and we've asked for call and ret edges to + // get a full fence-based mitigation, inject that fence. + if (HasCheckableLoad && FenceCallAndRet) { + // We need to insert an LFENCE at the start of the function to suspend any + // incoming misspeculation from the caller. This helps two-fold: the caller + // may not have been protected as this code has been, and this code gets to + // not take any specific action to protect across calls. + // FIXME: We could skip this for functions which unconditionally return + // a constant. + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + } + + // If we have no conditionals to protect in blocks, then all we needed to do + // was protect the entry and so we're done. + if (Infos.empty()) + // We may have changed the function's code at this point to insert fences. + return true; + + // For every basic block in the function which can b + if (HardenInterprocedurally && !FenceCallAndRet) { + // Set up the predicate state by extracting it from the incoming stack + // pointer so we pick up any misspeculation in our caller. + PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc); + } else { + // Otherwise, just build the predicate state itself by zeroing a register + // as we don't need any initial state. 
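The two values the predicate state can take are set up above and are easiest to sanity-check with a tiny standalone illustration (plain C++, not LLVM code; the address constant is made up). MOV64ri32 with an immediate of -1 sign-extends to an all-ones 64-bit value, and all-ones is what lets the OR-based hardening later in the pass erase any address:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Poison = ~0ULL;            // MOV64ri32 $-1, sign-extended to all ones
    const uint64_t ZeroState = 0;             // initial state on the non-speculative path
    uint64_t AnyAddr = 0x00007f00deadbeefULL; // hypothetical attacker-reachable address
    assert((AnyAddr | Poison) == Poison);     // ORing all-ones leaves no usable address bits
    assert((AnyAddr | ZeroState) == AnyAddr); // a zero state is a no-op on the hot path
    return 0;
  }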
+ PredStateReg = MRI->createVirtualRegister(PredStateRC); + unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), + PredStateSubReg); + ++NumInstsInserted; + MachineOperand *ZeroEFLAGSDefOp = + ZeroI->findRegisterDefOperand(X86::EFLAGS); + assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() && + "Must have an implicit def of EFLAGS!"); + ZeroEFLAGSDefOp->setIsDead(true); + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG), + PredStateReg) + .addImm(0) + .addReg(PredStateSubReg) + .addImm(X86::sub_32bit); + } + + // We're going to need to trace predicate state throughout the function's + // CFG. Prepare for this by setting up our initial state of PHIs with unique + // predecessor entries and all the initial predicate state. + + // FIXME: It's really frustrating that we have to do this, but SSA-form in + // MIR isn't what you might expect. We may have multiple entries in PHI nodes + // for a single predecessor. This makes CFG-updating extremely complex, so + // here we simplify all PHI nodes to a model even simpler than the IR's + // model: exactly one entry per predecessor, regardless of how many edges + // there are. + SmallPtrSet Preds; + SmallVector DupIndices; + for (auto &MBB : MF) + for (auto &MI : MBB) { + if (!MI.isPHI()) + break; + + // First we scan the operands of the PHI looking for duplicate entries + // a particular predecessor. We retain the operand index of each duplicate + // entry found. + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) + if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second) + DupIndices.push_back(OpIdx); + + // Now walk the duplicate indices, removing both the block and value. Note + // that these are stored as a vector making this element-wise removal + // :w + // potentially quadratic. + // + // FIXME: It is really frustrating that we have to use a quadratic + // removal algorithm here. There should be a better way, but the use-def + // updates required make that impossible using the public API. + // + // Note that we have to process these backwards so that we don't + // invalidate other indices with each removal. + while (!DupIndices.empty()) { + int OpIdx = DupIndices.pop_back_val(); + // Remove both the block and value operand, again in reverse order to + // preserve indices. + MI.RemoveOperand(OpIdx + 1); + MI.RemoveOperand(OpIdx); + } + + Preds.clear(); + } + + // Track the updated values in an SSA updater to rewrite into SSA form at the + // end. + MachineSSAUpdater PredStateSSA(MF); + PredStateSSA.Initialize(PredStateReg); + PredStateSSA.AddAvailableValue(&Entry, PredStateReg); + // Collect the inserted instructions so we can rewrite their uses of the + // predicate state into SSA form. + SmallVector CMovs; + + // Now walk all of the basic blocks looking for ones that end in conditional + // jumps where we need to update this register along each edge. + for (BlockCondInfo &Info : Infos) { + MachineBasicBlock &MBB = *Info.MBB; + SmallVectorImpl &CondBrs = Info.CondBrs; + MachineInstr *UncondBr = Info.UncondBr; + + LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName() + << "\n"); + ++NumCondBranchesTraced; + + // Compute the non-conditional successor as either the target of any + // unconditional branch or the layout successor. + MachineBasicBlock *UncondSucc = + UncondBr ? (UncondBr->getOpcode() == X86::JMP_1 + ? 
UncondBr->getOperand(0).getMBB() + : nullptr) + : &*std::next(MachineFunction::iterator(&MBB)); + + // Count how many edges there are to any given successor. + SmallDenseMap SuccCounts; + if (UncondSucc) + ++SuccCounts[UncondSucc]; + for (auto *CondBr : CondBrs) + ++SuccCounts[CondBr->getOperand(0).getMBB()]; + + // A lambda to insert cmov instructions into a block checking all of the + // condition codes in a sequence. + auto BuildCheckingBlockForSuccAndConds = + [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount, + MachineInstr *Br, MachineInstr *&UncondBr, + ArrayRef Conds) { + // First, we split the edge to insert the checking block into a safe + // location. + auto &CheckingMBB = + (SuccCount == 1 && Succ.pred_size() == 1) + ? Succ + : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII); + + bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS); + if (!LiveEFLAGS) + CheckingMBB.addLiveIn(X86::EFLAGS); + + // Now insert the cmovs to implement the checks. + auto InsertPt = CheckingMBB.begin(); + assert( + InsertPt == CheckingMBB.end() || + !InsertPt->isPHI() && + "Should never have a PHI in the initial checking block as it " + "always has a single predecessor!"); + + // We will wire each cmov to each other, but need to start with the + // incoming pred state. + unsigned CurStateReg = PredStateReg; + + for (X86::CondCode Cond : Conds) { + auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes); + + unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC); + auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp), + UpdatedStateReg) + .addReg(CurStateReg) + .addReg(PoisonReg); + // If this is the last cmov and the EFLAGS weren't originally + // live-in, mark them as killed. + if (!LiveEFLAGS && Cond == Conds.back()) + CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); + + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); + dbgs() << "\n"); + + // The first one of the cmovs will be using the top level + // `PredStateReg` and need to get rewritten into SSA form. + if (CurStateReg == PredStateReg) + CMovs.push_back(&*CMovI); + + // The next cmov should start from this one's def. + CurStateReg = UpdatedStateReg; + } + + // And put the last one into the available values for PredStateSSA. + PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg); + }; + + std::vector UncondCodeSeq; + for (auto *CondBr : CondBrs) { + MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB(); + int &SuccCount = SuccCounts[&Succ]; + + X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); + X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond); + UncondCodeSeq.push_back(Cond); + + BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr, + {InvCond}); + + // Decrement the successor count now that we've split one of the edges. + // We need to keep the count of edges to the successor accurate in order + // to know above when to *replace* the successor in the CFG vs. just + // adding the new successor. + --SuccCount; + } + + // Since we may have split edges and changed the number of successors, + // normalize the probabilities. This avoids doing it each time we split an + // edge. + MBB.normalizeSuccProbs(); + + // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we + // need to intersect the other condition codes. We can do this by just + // doing a cmov for each one. + if (!UncondSucc) + // If we have no fallthrough to protect (perhaps it is an indirect jump?) 
+ // just skip this and continue. + continue; + + assert(SuccCounts[UncondSucc] == 1 && + "We should never have more than one edge to the unconditional " + "successor at this point because every other edge must have been " + "split above!"); + + // Sort and unique the codes to minimize them. + llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end()); + UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()), + UncondCodeSeq.end()); + + // Build a checking version of the successor. + BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1, + UncondBr, UncondBr, UncondCodeSeq); + } + + // We may also enter basic blocks in this function via exception handling + // control flow. Here, if we are hardening interprocedurally, we need to + // re-capture the predicate state from the throwing code. In the Itanium ABI, + // the throw will always look like a call to __cxa_throw and will have the + // predicate state in the stack pointer, so extract fresh predicate state from + // the stack pointer and make it available in SSA. + // FIXME: Handle non-itanium ABI EH models. + if (HardenInterprocedurally) { + for (MachineBasicBlock &MBB : MF) { + assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!"); + assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!"); + assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!"); + if (!MBB.isEHPad()) + continue; + PredStateSSA.AddAvailableValue( + &MBB, + extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc)); + } + } + + // Now check all of the loads using the predicate state. + checkAllLoads(MF, PredStateSSA); + + // Now rewrite all the uses of the pred state using the SSA updater so that + // we track updates through the CFG. + for (MachineInstr *CMovI : CMovs) + for (MachineOperand &Op : CMovI->operands()) { + if (!Op.isReg() || Op.getReg() != PredStateReg) + continue; + + PredStateSSA.RewriteUse(Op); + } + + // If we are hardening interprocedurally, find each returning block and + // protect the caller from being returned to through misspeculation. + if (HardenInterprocedurally) + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + + MachineInstr &MI = MBB.back(); + if (!MI.isReturn()) + continue; + + checkReturnInstr(MI, PredStateSSA); + } + + LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump(); + dbgs() << "\n"; MF.verify(this)); + return true; +} + +/// Implements the naive hardening approach of putting an LFENCE after every +/// potentially mis-predicted control flow construct. +/// +/// We include this as an alternative mostly for the purpose of comparison. The +/// performance impact of this is expected to be extremely severe and not +/// practical for any real-world users. +void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE( + MachineFunction &MF) { + // First, we scan the function looking for blocks that are reached along edges + // that we might want to harden. + SmallSetVector Blocks; + for (MachineBasicBlock &MBB : MF) { + // If there are no or only one successor, nothing to do here. + if (MBB.succ_size() <= 1) + continue; + + // Skip blocks unless their terminators start with a branch. Other + // terminators don't seem interesting for guarding against misspeculation. + auto TermIt = MBB.getFirstTerminator(); + if (TermIt == MBB.end() || !TermIt->isBranch()) + continue; + + // Add all the non-EH-pad succossors to the blocks we want to harden. 
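The checking cmovs inserted by BuildCheckingBlockForSuccAndConds above reduce to a simple value-level model (plain C++ illustration, not the emitted MIR; the real CMOVcc reads EFLAGS, which also creates the data dependence the mitigation relies on):

  #include <cstdint>

  // The condition given to the cmov is chosen so it is true exactly when the CPU
  // should not have ended up on this edge: the inverted condition on the taken edge,
  // and each branch's own condition on the fallthrough edge.
  uint64_t traceEdge(uint64_t IncomingState, bool EdgeWasMispredicted) {
    const uint64_t Poison = ~0ULL;
    return EdgeWasMispredicted ? Poison : IncomingState; // CMOVcc state, poison
  }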
We + // skip EH pads because there isn't really a condition of interest on + // entering. + for (MachineBasicBlock *SuccMBB : MBB.successors()) + if (!SuccMBB->isEHPad()) + Blocks.insert(SuccMBB); + } + + for (MachineBasicBlock *MBB : Blocks) { + auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin()); + BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + } +} + +SmallVector +X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) { + SmallVector Infos; + + // Walk the function and build up a summary for each block's conditions that + // we need to trace through. + for (MachineBasicBlock &MBB : MF) { + // If there are no or only one successor, nothing to do here. + if (MBB.succ_size() <= 1) + continue; + + // We want to reliably handle any conditional branch terminators in the + // MBB, so we manually analyze the branch. We can handle all of the + // permutations here, including ones that analyze branch cannot. + // + // The approach is to walk backwards across the terminators, resetting at + // any unconditional non-indirect branch, and track all conditional edges + // to basic blocks as well as the fallthrough or unconditional successor + // edge. For each conditional edge, we track the target and the opposite + // condition code in order to inject a "no-op" cmov into that successor + // that will harden the predicate. For the fallthrough/unconditional + // edge, we inject a separate cmov for each conditional branch with + // matching condition codes. This effectively implements an "and" of the + // condition flags, even if there isn't a single condition flag that would + // directly implement that. We don't bother trying to optimize either of + // these cases because if such an optimization is possible, LLVM should + // have optimized the conditional *branches* in that way already to reduce + // instruction count. This late, we simply assume the minimal number of + // branch instructions is being emitted and use that to guide our cmov + // insertion. + + BlockCondInfo Info = {&MBB, {}, nullptr}; + + // Now walk backwards through the terminators and build up successors they + // reach and the conditions. + for (MachineInstr &MI : llvm::reverse(MBB)) { + // Once we've handled all the terminators, we're done. + if (!MI.isTerminator()) + break; + + // If we see a non-branch terminator, we can't handle anything so bail. + if (!MI.isBranch()) { + Info.CondBrs.clear(); + break; + } + + // If we see an unconditional branch, reset our state, clear any + // fallthrough, and set this is the "else" successor. + if (MI.getOpcode() == X86::JMP_1) { + Info.CondBrs.clear(); + Info.UncondBr = &MI; + continue; + } + + // If we get an invalid condition, we have an indirect branch or some + // other unanalyzable "fallthrough" case. We model this as a nullptr for + // the destination so we can still guard any conditional successors. + // Consider code sequences like: + // ``` + // jCC L1 + // jmpq *%rax + // ``` + // We still want to harden the edge to `L1`. + if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) { + Info.CondBrs.clear(); + Info.UncondBr = &MI; + continue; + } + + // We have a vanilla conditional branch, add it to our list. 
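For reference, the conditional branches collected here are later mapped to condition codes and cmov opcodes with helpers the pass already uses (sketch only; CondBr stands for one collected terminator):

  X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); // e.g. COND_NE for JNE_1
  X86::CondCode Inv  = X86::GetOppositeBranchCondition(Cond);          // e.g. COND_E
  unsigned CMovOp    = X86::getCMovFromCond(Inv, /*RegBytes=*/8);      // e.g. CMOVE64rr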
+ Info.CondBrs.push_back(&MI); + } + if (Info.CondBrs.empty()) { + ++NumBranchesUntraced; + LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n"; + MBB.dump()); + continue; + } + + Infos.push_back(Info); + } + + return Infos; +} + +/// Returns true if the instruction has no behavior (specified or otherwise) +/// that is based on the value of any of its register operands +/// +/// A classical example of something that is inherently not data invariant is an +/// indirect jump -- the destination is loaded into icache based on the bits set +/// in the jump destination register. +/// +/// FIXME: This should become part of our instruction tables. +static bool isDataInvariant(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the instruction is not data invariant. + return false; + + // FIXME: For now, we just use a very boring, conservative set of unary + // instructions because we're mostly interested in handling simple + // transformations. + case TargetOpcode::COPY: + return true; + } +} + +/// Returns true if the instruction has no behavior (specified or otherwise) +/// that is based on the value loaded from memory or the value of any +/// non-address register operands. +/// +/// For example, if the latency of the instruction is dependent on the +/// particular bits set in any of the registers *or* any of the bits loaded from +/// memory. +/// +/// A classical example of something that is inherently not data invariant is an +/// indirect jump -- the destination is loaded into icache based on the bits set +/// in the jump destination register. +/// +/// FIXME: This should become part of our instruction tables. +static bool isDataInvariantLoad(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the load will immediately leak. + return false; + + // On x86 it is believed that imul is constant time w.r.t. the loaded data. + // However, they set flags and are perhaps the most surprisingly constant + // time operations so we call them out here separately. + case X86::IMUL16rm: + case X86::IMUL16rmi8: + case X86::IMUL16rmi: + case X86::IMUL32rm: + case X86::IMUL32rmi8: + case X86::IMUL32rmi: + case X86::IMUL64rm: + case X86::IMUL64rmi32: + case X86::IMUL64rmi8: + + // Bitfield and bit scanning instructions that are somewhat surprisingly + // constant time as they scan across bits and do other fairly complex + // operations like popcnt, but are believed to be constant time on x86. + // However, these set flags. + case X86::BLCFILL32rm: + case X86::BLCFILL64rm: + case X86::BLCI32rm: + case X86::BLCI64rm: + case X86::BLCIC32rm: + case X86::BLCIC64rm: + case X86::BLCMSK32rm: + case X86::BLCMSK64rm: + case X86::BLCS32rm: + case X86::BLCS64rm: + case X86::BLSFILL32rm: + case X86::BLSFILL64rm: + case X86::BLSI32rm: + case X86::BLSI64rm: + case X86::BLSIC32rm: + case X86::BLSIC64rm: + case X86::BLSMSK32rm: + case X86::BLSMSK64rm: + case X86::BLSR32rm: + case X86::BLSR64rm: + case X86::BZHI32rm: + case X86::BZHI64rm: + case X86::LZCNT16rm: + case X86::LZCNT32rm: + case X86::LZCNT64rm: + case X86::POPCNT16rm: + case X86::POPCNT32rm: + case X86::POPCNT64rm: + case X86::TZCNT16rm: + case X86::TZCNT32rm: + case X86::TZCNT64rm: + case X86::TZMSK32rm: + case X86::TZMSK64rm: + + // Basic arithmetic is constant time on the input but does set flags. 
+ case X86::ADC8rm: + case X86::ADC16rm: + case X86::ADC32rm: + case X86::ADC64rm: + case X86::ADCX32rm: + case X86::ADCX64rm: + case X86::ADD8rm: + case X86::ADD16rm: + case X86::ADD32rm: + case X86::ADD64rm: + case X86::ADOX32rm: + case X86::ADOX64rm: + case X86::AND8rm: + case X86::AND16rm: + case X86::AND32rm: + case X86::AND64rm: + case X86::ANDN32rm: + case X86::ANDN64rm: + case X86::BSF16rm: + case X86::BSF32rm: + case X86::BSF64rm: + case X86::BSR16rm: + case X86::BSR32rm: + case X86::BSR64rm: + case X86::OR8rm: + case X86::OR16rm: + case X86::OR32rm: + case X86::OR64rm: + case X86::SBB8rm: + case X86::SBB16rm: + case X86::SBB32rm: + case X86::SBB64rm: + case X86::SUB8rm: + case X86::SUB16rm: + case X86::SUB32rm: + case X86::SUB64rm: + case X86::XOR8rm: + case X86::XOR16rm: + case X86::XOR32rm: + case X86::XOR64rm: + case X86::BEXTR32rm: + case X86::BEXTR64rm: + case X86::BEXTRI32mi: + case X86::BEXTRI64mi: + // Check whether the EFLAGS implicit-def is dead. We assume that this will + // always find the implicit-def because this code should only be reached + // for instructions that do in fact implicitly def this. + if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) { + // If we would clobber EFLAGS that are used, just bail for now. + LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: "; + MI.dump(); dbgs() << "\n"); + return false; + } + + // Otherwise, fallthrough to handle these the same as instructions that + // don't set EFLAGS. + LLVM_FALLTHROUGH; + + // Integer multiply w/o affecting flags is still believed to be constant + // time on x86. Called out separately as this is among the most surprising + // instructions to exhibit that behavior. + case X86::MULX32rm: + case X86::MULX64rm: + + // Arithmetic instructions that are both constant time and don't set flags. + case X86::PDEP32rm: + case X86::PDEP64rm: + case X86::PEXT32rm: + case X86::PEXT64rm: + case X86::RORX32mi: + case X86::RORX64mi: + case X86::SARX32rm: + case X86::SARX64rm: + case X86::SHLX32rm: + case X86::SHLX64rm: + case X86::SHRX32rm: + case X86::SHRX64rm: + + // Conversions are believed to be constant time and don't set flags. + // FIXME: Add AVX versions. + case X86::CVTSD2SI64rm_Int: + case X86::CVTSD2SIrm_Int: + case X86::CVTSS2SI64rm_Int: + case X86::CVTSS2SIrm_Int: + case X86::CVTTSD2SI64rm: + case X86::CVTTSD2SI64rm_Int: + case X86::CVTTSD2SIrm: + case X86::CVTTSD2SIrm_Int: + case X86::CVTTSS2SI64rm: + case X86::CVTTSS2SI64rm_Int: + case X86::CVTTSS2SIrm: + case X86::CVTTSS2SIrm_Int: + + // Loads to register don't set flags. + case X86::MOV8rm: + case X86::MOV8rm_NOREX: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::MOVSX16rm8: + case X86::MOVSX32rm16: + case X86::MOVSX32rm8: + case X86::MOVSX32rm8_NOREX: + case X86::MOVSX64rm16: + case X86::MOVSX64rm32: + case X86::MOVSX64rm8: + case X86::MOVZX16rm8: + case X86::MOVZX32rm16: + case X86::MOVZX32rm8: + case X86::MOVZX32rm8_NOREX: + case X86::MOVZX64rm16: + case X86::MOVZX64rm8: + return true; + } +} + +static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const TargetRegisterInfo &TRI) { + // Check if EFLAGS are alive by seeing if there is a def of them or they + // live-in, and then seeing if that def is in turn used. + for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) { + if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) { + // If the def is dead, then EFLAGS is not live. 
+ if (DefOp->isDead()) + return false; + + // Otherwise we've def'ed it, and it is live. + return true; + } + // While at this instruction, also check if we use and kill EFLAGS + // which means it isn't live. + if (MI.killsRegister(X86::EFLAGS, &TRI)) + return false; + } + + // If we didn't find anything conclusive (neither definitely alive or + // definitely dead) return whether it lives into the block. + return MBB.isLiveIn(X86::EFLAGS); +} + +void X86SpeculativeLoadHardeningPass::checkAllLoads( + MachineFunction &MF, MachineSSAUpdater &PredStateSSA) { + // If the actual checking of loads is disabled, skip doing anything here. + if (!HardenLoads) + return; + + SmallPtrSet HardenPostLoad; + SmallPtrSet HardenLoadAddr; + + SmallSet HardenedAddrRegs; + + SmallDenseMap AddrRegToHardenedReg; + + // Track the set of load-dependent registers through the basic block. Because + // the values of these registers have an existing data dependency on a loaded + // value which we would have checked, we can omit any checks on them. + SparseBitVector<> LoadDepRegs; + + for (MachineBasicBlock &MBB : MF) { + // We harden the loads of a basic block in several passes: + // + // 1) Collect all the loads which can have their loaded value hardened + // and all the loads that instead need their address hardened. During + // this walk we propagate load dependence for address hardened loads and + // also look for LFENCE to stop hardening wherever possible. When + // deciding whether or not to harden the loaded value or not, we check + // to see if any registers used in the address will have been hardened + // at this point and if so, harden any remaining address registers as + // that often successfully re-uses hardened addresses and minimizes + // instructions. FIXME: We should consider an aggressive mode where we + // continue to keep as many loads value hardened even when some address + // register hardening would be free (due to reuse). + for (MachineInstr &MI : MBB) { + // We naively assume that all def'ed registers of an instruction have + // a data dependency on all of their operands. + // FIXME: Do a more careful analysis of x86 to build a conservative model + // here. + if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) { + return Op.isReg() && LoadDepRegs.test(Op.getReg()); + })) + for (MachineOperand &Def : MI.defs()) + if (Def.isReg()) + LoadDepRegs.set(Def.getReg()); + + // Both Intel and AMD are guiding that they will change the semantics of + // LFENCE to be a speculation barrier, so if we see an LFENCE, there is + // no more need to guard things in this block. + if (MI.getOpcode() == X86::LFENCE) + break; + + // If this instruction cannot load, nothing to do. + if (!MI.mayLoad()) + continue; + + // Some instructions which "load" are trivially safe or unimportant. + if (MI.getOpcode() == X86::MFENCE) + continue; + + // Extract the memory operand information about this instruction. + // FIXME: This doesn't handle loading pseudo instructions which we often + // could handle with similarly generic logic. We probably need to add an + // MI-layer routine similar to the MC-layer one we use here which maps + // pseudos much like this maps real instructions. 
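  // getMemoryOperandNo() plus getOperandBias() (used just below) yield the start of the
  // fixed five-operand x86 memory reference, laid out as
  //   Base, ScaleAmt, Index, Disp, SegmentReg,
  // which X86::AddrBaseReg (0) and X86::AddrIndexReg (2) index into.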
+ const MCInstrDesc &Desc = MI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemRefBeginIdx < 0) { + LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: "; + MI.dump()); + continue; + } + + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + + // If we have at least one (non-frame-index, non-RIP) register operand, + // and neither operand is load-dependent, we need to check the load. + unsigned BaseReg = 0, IndexReg = 0; + if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP && + BaseMO.getReg() != X86::NoRegister) + BaseReg = BaseMO.getReg(); + if (IndexMO.getReg() != X86::NoRegister) + IndexReg = IndexMO.getReg(); + + if (!BaseReg && !IndexReg) + // No register operands! + continue; + + // If any register operand is dependent, this load is dependent and we + // needn't check it. + // FIXME: Is this true in the case where we are hardening loads after + // they complete? Unclear, need to investigate. + if ((BaseReg && LoadDepRegs.test(BaseReg)) || + (IndexReg && LoadDepRegs.test(IndexReg))) + continue; + + // If post-load hardening is enabled, this load is known to be + // data-invariant, and we aren't already going to harden one of the + // address registers, queue it up to be hardened post-load. Notably, even + // once hardened this won't introduce a useful dependency that could prune + // out subsequent loads. + if (EnablePostLoadHardening && isDataInvariantLoad(MI) && + !HardenedAddrRegs.count(BaseReg) && + !HardenedAddrRegs.count(IndexReg)) { + HardenPostLoad.insert(&MI); + HardenedAddrRegs.insert(MI.getOperand(0).getReg()); + continue; + } + + // Record this instruction for address hardening and record its register + // operands as being address-hardened. + HardenLoadAddr.insert(&MI); + if (BaseReg) + HardenedAddrRegs.insert(BaseReg); + if (IndexReg) + HardenedAddrRegs.insert(IndexReg); + + for (MachineOperand &Def : MI.defs()) + if (Def.isReg()) + LoadDepRegs.set(Def.getReg()); + } + + // Now re-walk the instructions in the basic block, and apply whichever + // hardening strategy we have elected. Note that we do this in a second + // pass specifically so that we have the complete set of instructions for + // which we will do post-load hardening and can defer it in certain + // circumstances. + // + // FIXME: This could probably be made even more effective by doing it + // across the entire function. Rather than just walking the flat list + // backwards here, we could walk the function in PO and each block bottom + // up, allowing us to in some cases sink hardening across block blocks. As + // long as the in-block predicate state is used at the eventual hardening + // site, this remains safe. + for (MachineInstr &MI : MBB) { + // We cannot both require hardening the def of a load and its address. + assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) && + "Requested to harden both the address and def of a load!"); + + // Check if this is a load whose address needs to be hardened. 
+ if (HardenLoadAddr.erase(&MI)) { + const MCInstrDesc &Desc = MI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!"); + + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = + MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg); + continue; + } + + // Test if this instruction is one of our post load instructions (and + // remove it from the set if so). + if (HardenPostLoad.erase(&MI)) { + assert(!MI.isCall() && "Must not try to post-load harden a call!"); + + // If this is a data-invariant load, we want to try and sink any + // hardening as far as possible. + if (isDataInvariantLoad(MI)) { + // Sink the instruction we'll need to harden as far as we can down the + // graph. + MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad); + + // If we managed to sink this instruction, update everything so we + // harden that instruction when we reach it in the instruction + // sequence. + if (SunkMI != &MI) { + // If in sinking there was no instruction needing to be hardened, + // we're done. + if (!SunkMI) + continue; + + // Otherwise, add this to the set of defs we harden. + HardenPostLoad.insert(SunkMI); + continue; + } + } + + // The register def'ed by this instruction is trivially hardened so map + // it to itself. + AddrRegToHardenedReg[MI.getOperand(0).getReg()] = + MI.getOperand(0).getReg(); + + hardenPostLoad(MI, PredStateSSA); + continue; + } + + // After we finish processing the instruction and doing any hardening + // necessary for it, we need to handle transferring the predicate state + // into a call and recovering it after the call returns (if it returns). + if (!MI.isCall()) + continue; + + // If we're not hardening interprocedurally, we can just skip calls. + if (!HardenInterprocedurally) + continue; + + auto InsertPt = MI.getIterator(); + DebugLoc Loc = MI.getDebugLoc(); + + // First, we transfer the predicate state into the called function by + // merging it into the stack pointer. This will kill the current def of + // the state. + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg); + + // If this call is also a return (because it is a tail call) we're done. + if (MI.isReturn()) + continue; + + // Otherwise we need to step past the call and recover the predicate + // state from SP after the return, and make this new state available. + ++InsertPt; + unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc); + PredStateSSA.AddAvailableValue(&MBB, NewStateReg); + } + + HardenPostLoad.clear(); + HardenLoadAddr.clear(); + HardenedAddrRegs.clear(); + AddrRegToHardenedReg.clear(); + + // Currently, we only track data-dependent loads within a basic block. + // FIXME: We should see if this is necessary or if we could be more + // aggressive here without opening up attack avenues. + LoadDepRegs.clear(); + } +} + +/// Save EFLAGS into the returned GPR. This can in turn be restored with +/// `restoreEFLAGS`. +/// +/// Note that LLVM can only lower very simple patterns of saved and restored +/// EFLAGS registers. The restore should always be within the same basic block +/// as the save so that no PHI nodes are inserted. 
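A sketch of the intended save/restore pairing, with both calls kept in one basic block as required above (InsertPt and Loc are assumed to be an insertion point and DebugLoc already in scope, as in hardenLoadAddr() below):

  unsigned SavedFlags = saveEFLAGS(MBB, InsertPt, Loc); // EFLAGS -> virtual GR32 copy
  // ... emit EFLAGS-clobbering hardening instructions here ...
  restoreEFLAGS(MBB, InsertPt, Loc, SavedFlags);        // copy back into EFLAGS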
+unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + DebugLoc Loc) { + // FIXME: Hard coding this to a 32-bit register class seems weird, but matches + // what instruction selection does. + unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass); + // We directly copy the FLAGS register and rely on later lowering to clean + // this up into the appropriate setCC instructions. + BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS); + ++NumInstsInserted; + return Reg; +} + +/// Restore EFLAGS from the provided GPR. This should be produced by +/// `saveEFLAGS`. +/// +/// This must be done within the same basic block as the save in order to +/// reliably lower. +void X86SpeculativeLoadHardeningPass::restoreEFLAGS( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned Reg) { + BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg); + ++NumInstsInserted; +} + +/// Takes the current predicate state (in a register) and merges it into the +/// stack pointer. The state is essentially a single bit, but we merge this in +/// a way that won't form non-canonical pointers and also will be preserved +/// across normal stack adjustments. +void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned PredStateReg) { + unsigned TmpReg = MRI->createVirtualRegister(PredStateRC); + // FIXME: This hard codes a shift distance based on the number of bits needed + // to stay canonical on 64-bit. We should compute this somehow and support + // 32-bit as part of that. + auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg) + .addReg(PredStateReg, RegState::Kill) + .addImm(47); + ShiftI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(TmpReg, RegState::Kill); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; +} + +/// Extracts the predicate state stored in the high bits of the stack pointer. +unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + DebugLoc Loc) { + unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC); + unsigned TmpReg = MRI->createVirtualRegister(PredStateRC); + + // We know that the stack pointer will have any preserved predicate state in + // its high bit. We just want to smear this across the other bits. Turns out, + // this is exactly what an arithmetic right shift does. + BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg) + .addReg(X86::RSP); + auto ShiftI = + BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg) + .addReg(TmpReg, RegState::Kill) + .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1); + ShiftI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + + return PredStateReg; +} + +void X86SpeculativeLoadHardeningPass::hardenLoadAddr( + MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO, + MachineSSAUpdater &PredStateSSA, + SmallDenseMap &AddrRegToHardenedReg) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + + // Check if EFLAGS are alive by seeing if there is a def of them or they + // live-in, and then seeing if that def is in turn used. 
+ bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI); + + SmallVector HardenOpRegs; + + if (BaseMO.isFI()) { + // A frame index is never a dynamically controllable load, so only + // harden it if we're covering fixed address loads as well. + LLVM_DEBUG( + dbgs() << " Skipping hardening base of explicit stack frame load: "; + MI.dump(); dbgs() << "\n"); + } else if (BaseMO.getReg() == X86::RIP || + BaseMO.getReg() == X86::NoRegister) { + // For both RIP-relative addressed loads or absolute loads, we cannot + // meaningfully harden them because the address being loaded has no + // dynamic component. + // + // FIXME: When using a segment base (like TLS does) we end up with the + // dynamic address being the base plus -1 because we can't mutate the + // segment register here. This allows the signed 32-bit offset to point at + // valid segment-relative addresses and load them successfully. + LLVM_DEBUG( + dbgs() << " Cannot harden base of " + << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base") + << " address in a load!"); + } else { + assert(BaseMO.isReg() && + "Only allowed to have a frame index or register base."); + HardenOpRegs.push_back(&BaseMO); + } + + if (IndexMO.getReg() != X86::NoRegister && + (HardenOpRegs.empty() || + HardenOpRegs.front()->getReg() != IndexMO.getReg())) + HardenOpRegs.push_back(&IndexMO); + + assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) && + "Should have exactly one or two registers to harden!"); + assert((HardenOpRegs.size() == 1 || + HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) && + "Should not have two of the same registers!"); + + // Remove any registers that have alreaded been checked. + llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) { + // See if this operand's register has already been checked. + auto It = AddrRegToHardenedReg.find(Op->getReg()); + if (It == AddrRegToHardenedReg.end()) + // Not checked, so retain this one. + return false; + + // Otherwise, we can directly update this operand and remove it. + Op->setReg(It->second); + return true; + }); + // If there are none left, we're done. + if (HardenOpRegs.empty()) + return; + + // Compute the current predicate state. + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + + auto InsertPt = MI.getIterator(); + + // If EFLAGS are live and we don't have access to instructions that avoid + // clobbering EFLAGS we need to save and restore them. This in turn makes + // the EFLAGS no longer live. + unsigned FlagsReg = 0; + if (EFLAGSLive && !Subtarget->hasBMI2()) { + EFLAGSLive = false; + FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); + } + + for (MachineOperand *Op : HardenOpRegs) { + auto *OpRC = MRI->getRegClass(Op->getReg()); + + unsigned OpReg = Op->getReg(); + unsigned TmpReg = MRI->createVirtualRegister(OpRC); + + if (!EFLAGSLive) { + // Merge our potential poison state into the value with an or. + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg) + .addReg(StateReg) + .addReg(OpReg); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); + } else { + // We need to avoid touching EFLAGS so shift out all but the least + // significant bit using the instruction that doesn't update flags. 
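The two address-hardening forms used here (OR64rr when EFLAGS are dead, SHRX64rr when they must be preserved) behave identically at the value level; a plain C++ illustration, not the emitted MIR:

  #include <cstdint>

  uint64_t hardenWithOr(uint64_t Addr, uint64_t State) {
    return Addr | State;         // OR64rr path: clobbers EFLAGS, poisons every address bit
  }

  uint64_t hardenWithShrx(uint64_t Addr, uint64_t State) {
    return Addr >> (State & 63); // SHRX64rr path: flag-free, shift count is 0 or 63
  }

With a zero state both are no-ops; with the all-ones poison the OR produces an all-ones, non-canonical address and the SHRX discards all but the original top bit.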
+ auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg) + .addReg(OpReg) + .addReg(StateReg); + (void)ShiftI; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump(); + dbgs() << "\n"); + } + + // Record this register as checked and update the operand. + assert(!AddrRegToHardenedReg.count(Op->getReg()) && + "Should not have checked this register yet!"); + AddrRegToHardenedReg[Op->getReg()] = TmpReg; + Op->setReg(TmpReg); + ++NumAddrRegsHardened; + } + + // And restore the flags if needed. + if (FlagsReg) + restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg); +} + +MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( + MachineInstr &InitialMI, SmallPtrSetImpl &HardenedLoads) { + assert(isDataInvariantLoad(InitialMI) && + "Cannot get here with a non-invariant load!"); + + // See if we can sink hardening the loaded value. + auto SinkCheckToSingleUse = + [&](MachineInstr &MI) -> Optional { + unsigned DefReg = MI.getOperand(0).getReg(); + + // We need to find a single use which we can sink the check. We can + // primarily do this because many uses may already end up checked on their + // own. + MachineInstr *SingleUseMI = nullptr; + for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) { + // If we're already going to harden this use, it is data invariant and + // within our block and we just need to check that the use isn't in an + // address. + if (HardenedLoads.count(&UseMI)) { + const MCInstrDesc &Desc = UseMI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + assert(MemRefBeginIdx >= 0 && + "Should always have mem references here!"); + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = + UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) || + (IndexMO.isReg() && IndexMO.getReg() == DefReg)) + // The load uses the register as part of its address making it not + // invariant. + return {}; + + continue; + } + + if (SingleUseMI) + // We already have a single use, this would make two. Bail. + return {}; + + // If this single use isn't data invariant, isn't in this block, or has + // interfering EFLAGS, we can't sink the hardening to it. + if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent()) + return {}; + + // If this instruction defines multiple registers bail as we won't harden + // all of them. + if (UseMI.getDesc().getNumDefs() > 1) + return {}; + + // If this register isn't a virtual register we can't walk uses of sanely, + // just bail. Also check that its register class is one of the ones we + // can harden. + unsigned UseDefReg = UseMI.getOperand(0).getReg(); + if (!TRI->isVirtualRegister(UseDefReg) || + !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass)) + return {}; + + SingleUseMI = &UseMI; + } + + // If SingleUseMI is still null, there is no use that needs its own + // checking. Otherwise, it is the single use that needs checking. + return {SingleUseMI}; + }; + + MachineInstr *MI = &InitialMI; + while (Optional SingleUse = SinkCheckToSingleUse(*MI)) { + // Update which MI we're checking now. + MI = *SingleUse; + if (!MI) + break; + } + + return MI; +} + +// We can harden non-leaking loads into register without touching the address +// by just hiding all of the loaded bits. We use an `or` instruction to do +// this because having the poison value be all ones allows us to use the same +// value below. 
And the goal is just for the loaded bits to not be exposed to +// execution and coercing them to one is sufficient. +void X86SpeculativeLoadHardeningPass::hardenPostLoad( + MachineInstr &MI, MachineSSAUpdater &PredStateSSA) { + assert(isDataInvariantLoad(MI) && + "Cannot get here with a non-invariant load!"); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + + // For all of these, the def'ed register operand is operand zero. + auto &DefOp = MI.getOperand(0); + unsigned OldDefReg = DefOp.getReg(); + + auto *DefRC = MRI->getRegClass(OldDefReg); + int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8; + + unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; + unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)]; + + unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; + + auto GetStateRegInRC = [&](const TargetRegisterClass &RC) { + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + + int Bytes = TRI->getRegSizeInBits(RC) / 8; + // FIXME: Need to teach this about 32-bit mode. + if (Bytes != 8) { + unsigned SubRegImm = SubRegImms[Log2_32(Bytes)]; + unsigned NarrowStateReg = MRI->createVirtualRegister(&RC); + BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY), + NarrowStateReg) + .addReg(StateReg, 0, SubRegImm); + StateReg = NarrowStateReg; + } + return StateReg; + }; + + auto InsertPt = std::next(MI.getIterator()); + unsigned FlagsReg = 0; + bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI); + if (EFLAGSLive && !Subtarget->hasBMI2()) { + FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); + EFLAGSLive = false; + } + + if (!EFLAGSLive) { + unsigned StateReg = GetStateRegInRC(*DefRC); + unsigned NewDefReg = MRI->createVirtualRegister(DefRC); + DefOp.setReg(NewDefReg); + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg) + .addReg(StateReg) + .addReg(NewDefReg); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); + } else { + assert(Subtarget->hasBMI2() && + "Cannot harden loads and preserve EFLAGS without BMI2!"); + + unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr; + auto &ShiftRC = + DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass; + int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8; + unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)]; + + unsigned StateReg = GetStateRegInRC(ShiftRC); + + // First have the def instruction def a temporary register. + unsigned TmpReg = MRI->createVirtualRegister(DefRC); + DefOp.setReg(TmpReg); + // Now copy it into a register of the shift RC. + unsigned ShiftInputReg = TmpReg; + if (DefRegBytes != ShiftRegBytes) { + unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC); + BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg); + ShiftInputReg = MRI->createVirtualRegister(&ShiftRC); + BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg) + .addReg(UndefReg) + .addReg(TmpReg) + .addImm(DefSubRegImm); + } + + // We shift this once if the shift is wider than the def and thus we can + // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts. 
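+    // The shift count comes from the predicate state: zero on the correct
+    // path (a no-op shift) and all ones under misspeculation, where SHRX
+    // masks it down to the register width minus one. That single shift
+    // clears any value narrower than the shift register, but a full-width
+    // value keeps its top bit, which is why a second shift is needed below
+    // when DefRegBytes == ShiftRegBytes.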
+ + unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC); + auto Shift1I = + BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg) + .addReg(ShiftInputReg) + .addReg(StateReg); + (void)Shift1I; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n"); + + // The only way we have a bit left is if all 8 bytes were defined. Do an + // extra shift to get the last bit in this case. + if (DefRegBytes == ShiftRegBytes) { + // We can just directly def the old def register as its the same size. + ShiftInputReg = ShiftedReg; + auto Shift2I = + BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg) + .addReg(ShiftInputReg) + .addReg(StateReg); + (void)Shift2I; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump(); + dbgs() << "\n"); + } else { + // When we have different size shift register we need to fix up the + // class. We can do that as we copy into the old def register. + BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg) + .addReg(ShiftedReg, 0, DefSubRegImm); + } + } + + if (FlagsReg) + restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg); + + ++NumPostLoadRegsHardened; +} + +void X86SpeculativeLoadHardeningPass::checkReturnInstr( + MachineInstr &MI, MachineSSAUpdater &PredStateSSA) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + auto InsertPt = MI.getIterator(); + + if (FenceCallAndRet) { + // Simply forcibly block speculation of loads out of the function by using + // an LFENCE. This is potentially a heavy-weight mitigation strategy, but + // should be secure, is simple from an ABI perspective, and the cost can be + // minimized through inlining. + // + // FIXME: We should investigate ways to establish a strong data-dependency + // on the return. However, poisoning the stack pointer is unlikely to work + // because the return is *predicted* rather than relying on the load of the + // return address to actually resolve. + BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + return; + } + + // Take our predicate state, shift it to the high 17 bits (so that we keep + // pointers canonical) and merge it into RSP. This will allow the caller to + // extract it when we return (speculatively). 
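+  // As the test output below shows, this becomes a `shlq $47` of the state
+  // followed by an `orq` into RSP: a zero state leaves RSP unchanged, while
+  // an all-ones state sets bits 63:47, which is still a canonical pointer.
+  // The caller then rebuilds its state with `movq %rsp, %reg` and
+  // `sarq $63, %reg`, the pattern visible after every call in the tests.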
+ mergePredStateIntoSP(MBB, InsertPt, Loc, + PredStateSSA.GetValueAtEndOfBlock(&MBB)); +} + +INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE, + "X86 speculative load hardener", false, false) +INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE, + "X86 speculative load hardener", false, false) + +FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() { + return new X86SpeculativeLoadHardeningPass(); +} Index: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp @@ -54,6 +54,10 @@ cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt EnableSpeculativeLoadHardening( + "x86-speculative-load-hardening", + cl::desc("Enable speculative load hardening"), cl::init(false), cl::Hidden); + namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); @@ -463,6 +467,9 @@ addPass(createX86AvoidStoreForwardingBlocks()); } + if (EnableSpeculativeLoadHardening) + addPass(createX86SpeculativeLoadHardeningPass()); + addPass(createX86FlagsCopyLoweringPass()); addPass(createX86WinAllocaExpander()); } Index: llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll +++ llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll @@ -0,0 +1,571 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -x86-speculative-load-hardening-lfence | FileCheck %s --check-prefix=X64-LFENCE +; +; FIXME: Add support for 32-bit and other EH ABIs. 
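+;
+; In the X64 checks below, %rax (or another scratch register) holds the
+; predicate state: it is recomputed from the stack pointer on entry and after
+; every call (movq %rsp, %rax; sarq $63, %rax), updated with cmov on each
+; conditional edge, OR-ed into loaded pointers and indices before they are
+; dereferenced, and merged back into %rsp (shlq $47; orq) before calls and
+; returns. The X64-LFENCE checks show the simpler lfence-per-edge strategy
+; selected by -x86-speculative-load-hardening-lfence.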
+ +declare void @leak(i32 %v1, i32 %v2) + +declare void @sink(i32) + +define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) nounwind { +; X64-LABEL: test_basic_conditions: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %rbx +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # %bb.2: # %then1 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB0_4 +; X64-NEXT: .LBB0_1: +; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: .LBB0_8: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: retq +; X64-NEXT: .LBB0_4: # %then2 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: testl %edx, %edx +; X64-NEXT: je .LBB0_6 +; X64-NEXT: # %bb.5: # %else3 +; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: movslq (%r9), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: leaq (%r15,%rcx,4), %r14 +; X64-NEXT: movl %ecx, (%r15,%rcx,4) +; X64-NEXT: jmp .LBB0_7 +; X64-NEXT: .LBB0_6: # %then3 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: movl (%rcx), %ecx +; X64-NEXT: addl (%r15), %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movslq %ecx, %rdi +; X64-NEXT: movl (%r15,%rdi,4), %esi +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movq (%r9), %r14 +; X64-NEXT: orq %rax, %r14 +; X64-NEXT: addl (%r14), %esi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq leak +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: .LBB0_7: # %merge +; X64-NEXT: movslq (%r14), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl $0, (%r15,%rcx,4) +; X64-NEXT: jmp .LBB0_8 +; +; X64-LFENCE-LABEL: test_basic_conditions: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB0_6 +; X64-LFENCE-NEXT: # %bb.1: # %then1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %esi, %esi +; X64-LFENCE-NEXT: je .LBB0_2 +; X64-LFENCE-NEXT: .LBB0_6: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: retq +; X64-LFENCE-NEXT: .LBB0_2: # %then2 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: movq %r8, %rbx +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %edx, %edx +; X64-LFENCE-NEXT: je .LBB0_3 +; X64-LFENCE-NEXT: # %bb.4: # %else3 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%r9), %rax +; X64-LFENCE-NEXT: leaq (%rbx,%rax,4), %r14 +; X64-LFENCE-NEXT: movl %eax, (%rbx,%rax,4) +; X64-LFENCE-NEXT: jmp .LBB0_5 +; X64-LFENCE-NEXT: .LBB0_3: # %then3 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movl (%rcx), %eax +; X64-LFENCE-NEXT: addl (%rbx), %eax +; X64-LFENCE-NEXT: movslq %eax, %rdi +; X64-LFENCE-NEXT: movl (%rbx,%rdi,4), %esi +; X64-LFENCE-NEXT: movq (%r9), %r14 +; X64-LFENCE-NEXT: addl (%r14), %esi +; X64-LFENCE-NEXT: # kill: def $edi killed $edi killed $rdi +; X64-LFENCE-NEXT: callq leak +; X64-LFENCE-NEXT: .LBB0_5: # %merge +; X64-LFENCE-NEXT: movslq (%r14), %rax +; X64-LFENCE-NEXT: movl $0, (%rbx,%rax,4) +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %then1, label %exit + +then1: + %b.cmp = icmp eq i32 %b, 0 + br i1 %b.cmp, label %then2, label %exit + +then2: + %c.cmp = icmp eq i32 
%c, 0 + br i1 %c.cmp, label %then3, label %else3 + +then3: + %secret1 = load i32, i32* %ptr1 + %secret2 = load i32, i32* %ptr2 + %secret.sum1 = add i32 %secret1, %secret2 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum1 + %secret3 = load i32, i32* %ptr2.idx + %secret4 = load i32*, i32** %ptr3 + %secret5 = load i32, i32* %secret4 + %secret.sum2 = add i32 %secret3, %secret5 + call void @leak(i32 %secret.sum1, i32 %secret.sum2) + br label %merge + +else3: + %secret6 = load i32*, i32** %ptr3 + %cast = ptrtoint i32* %secret6 to i32 + %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %cast + store i32 %cast, i32* %ptr2.idx2 + br label %merge + +merge: + %phi = phi i32* [ %secret4, %then3 ], [ %ptr2.idx2, %else3 ] + %secret7 = load i32, i32* %phi + %ptr2.idx3 = getelementptr i32, i32* %ptr2, i32 %secret7 + store i32 0, i32* %ptr2.idx3 + br label %exit + +exit: + ret void +} + +define void @test_basic_loop(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) nounwind { +; X64-LABEL: test_basic_loop: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %r15 +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: je .LBB1_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: cmoveq %r15, %rax +; X64-NEXT: jmp .LBB1_5 +; X64-NEXT: .LBB1_2: # %l.header.preheader +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movl %esi, %ebp +; X64-NEXT: cmovneq %r15, %rax +; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: jmp .LBB1_3 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB1_6: # in Loop: Header=BB1_3 Depth=1 +; X64-NEXT: cmovgeq %r15, %rax +; X64-NEXT: .LBB1_3: # %l.header +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movslq (%r12), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %ebx +; X64-NEXT: cmpl %ebp, %ebx +; X64-NEXT: jl .LBB1_6 +; X64-NEXT: # %bb.4: +; X64-NEXT: cmovlq %r15, %rax +; X64-NEXT: .LBB1_5: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X64-LFENCE-LABEL: test_basic_loop: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r15 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB1_3 +; X64-LFENCE-NEXT: # %bb.1: # %l.header.preheader +; X64-LFENCE-NEXT: movq %rcx, %r14 +; X64-LFENCE-NEXT: movq %rdx, %r15 +; X64-LFENCE-NEXT: movl %esi, %ebp +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %ebx, %ebx +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB1_2: # %l.header +; X64-LFENCE-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%r15), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %ebx +; X64-LFENCE-NEXT: cmpl %ebp, %ebx +; X64-LFENCE-NEXT: jl .LBB1_2 +; X64-LFENCE-NEXT: .LBB1_3: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %r15 +; X64-LFENCE-NEXT: popq %rbp +; 
X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %l.header, label %exit + +l.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %l.header ] + %secret = load i32, i32* %ptr1 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + %i.next = add i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %b + br i1 %i.cmp, label %l.header, label %exit + +exit: + ret void +} + +define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2) nounwind { +; X64-LABEL: test_basic_nested_loop: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r13 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %rax +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %r12 +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: je .LBB2_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: cmoveq %r12, %rax +; X64-NEXT: jmp .LBB2_10 +; X64-NEXT: .LBB2_2: # %l1.header.preheader +; X64-NEXT: movq %r8, %r14 +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movl %edx, %ebp +; X64-NEXT: movl %esi, %r15d +; X64-NEXT: cmovneq %r12, %rax +; X64-NEXT: xorl %r13d, %r13d +; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testl %r15d, %r15d +; X64-NEXT: jg .LBB2_5 +; X64-NEXT: jmp .LBB2_4 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_12: +; X64-NEXT: cmovgeq %r12, %rax +; X64-NEXT: testl %r15d, %r15d +; X64-NEXT: jle .LBB2_4 +; X64-NEXT: .LBB2_5: # %l2.header.preheader +; X64-NEXT: cmovleq %r12, %rax +; X64-NEXT: xorl %r15d, %r15d +; X64-NEXT: jmp .LBB2_6 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_11: # in Loop: Header=BB2_6 Depth=1 +; X64-NEXT: cmovgeq %r12, %rax +; X64-NEXT: .LBB2_6: # %l2.header +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %r15d +; X64-NEXT: cmpl %ebp, %r15d +; X64-NEXT: jl .LBB2_11 +; X64-NEXT: # %bb.7: +; X64-NEXT: cmovlq %r12, %rax +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload +; X64-NEXT: jmp .LBB2_8 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_4: +; X64-NEXT: cmovgq %r12, %rax +; X64-NEXT: .LBB2_8: # %l1.latch +; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %r13d +; X64-NEXT: cmpl %r15d, %r13d +; X64-NEXT: jl .LBB2_12 +; X64-NEXT: # %bb.9: +; X64-NEXT: cmovlq %r12, %rax +; X64-NEXT: .LBB2_10: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: addq $8, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X64-LFENCE-LABEL: test_basic_nested_loop: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r15 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %r13 +; X64-LFENCE-NEXT: pushq %r12 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB2_6 +; 
X64-LFENCE-NEXT: # %bb.1: # %l1.header.preheader +; X64-LFENCE-NEXT: movq %r8, %r14 +; X64-LFENCE-NEXT: movq %rcx, %rbx +; X64-LFENCE-NEXT: movl %edx, %r13d +; X64-LFENCE-NEXT: movl %esi, %r15d +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %r12d, %r12d +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB2_2: # %l1.header +; X64-LFENCE-NEXT: # =>This Loop Header: Depth=1 +; X64-LFENCE-NEXT: # Child Loop BB2_4 Depth 2 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %r15d, %r15d +; X64-LFENCE-NEXT: jle .LBB2_5 +; X64-LFENCE-NEXT: # %bb.3: # %l2.header.preheader +; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %ebp, %ebp +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB2_4: # %l2.header +; X64-LFENCE-NEXT: # Parent Loop BB2_2 Depth=1 +; X64-LFENCE-NEXT: # => This Inner Loop Header: Depth=2 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%rbx), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %ebp +; X64-LFENCE-NEXT: cmpl %r13d, %ebp +; X64-LFENCE-NEXT: jl .LBB2_4 +; X64-LFENCE-NEXT: .LBB2_5: # %l1.latch +; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%rbx), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %r12d +; X64-LFENCE-NEXT: cmpl %r15d, %r12d +; X64-LFENCE-NEXT: jl .LBB2_2 +; X64-LFENCE-NEXT: .LBB2_6: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r12 +; X64-LFENCE-NEXT: popq %r13 +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %r15 +; X64-LFENCE-NEXT: popq %rbp +; X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %l1.header, label %exit + +l1.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %l1.latch ] + %b.cmp = icmp sgt i32 %b, 0 + br i1 %b.cmp, label %l2.header, label %l1.latch + +l2.header: + %j = phi i32 [ 0, %l1.header ], [ %j.next, %l2.header ] + %secret = load i32, i32* %ptr1 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + %j.next = add i32 %j, 1 + %j.cmp = icmp slt i32 %j.next, %c + br i1 %j.cmp, label %l2.header, label %l1.latch + +l1.latch: + %secret2 = load i32, i32* %ptr1 + %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %secret2 + %leak2 = load i32, i32* %ptr2.idx2 + call void @sink(i32 %leak2) + %i.next = add i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %b + br i1 %i.cmp, label %l1.header, label %exit + +exit: + ret void +} + +declare i32 @__gxx_personality_v0(...) 
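+;
+; The EH test below checks that hardening composes with invoke/landingpad:
+; the predicate state is merged into %rsp before the calls to
+; __cxa_allocate_exception and __cxa_throw, and the landing pad re-derives it
+; from %rsp (movq %rsp, %rcx; sarq $63, %rcx) before hardening the loads of
+; the caught value.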
+ +declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr + +define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; X64-LABEL: test_basic_eh: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: sarq $63, %rax +; X64-NEXT: cmpl $41, %edi +; X64-NEXT: jg .LBB3_1 +; X64-NEXT: # %bb.2: # %thrower +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movl (%rsi,%rcx,4), %ebp +; X64-NEXT: orl %eax, %ebp +; X64-NEXT: movl $4, %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq __cxa_allocate_exception +; X64-NEXT: movq %rsp, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movl %ebp, (%rax) +; X64-NEXT: .Ltmp0: +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: shlq $47, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: orq %rcx, %rsp +; X64-NEXT: callq __cxa_throw +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: .Ltmp1: +; X64-NEXT: jmp .LBB3_3 +; X64-NEXT: .LBB3_1: +; X64-NEXT: cmovleq %rcx, %rax +; X64-NEXT: .LBB3_3: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; X64-NEXT: .LBB3_4: # %lpad +; X64-NEXT: .Ltmp2: +; X64-NEXT: movq %rsp, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: addl (%rbx), %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: cltq +; X64-NEXT: movl (%r14,%rax,4), %edi +; X64-NEXT: orl %ecx, %edi +; X64-NEXT: shlq $47, %rcx +; X64-NEXT: orq %rcx, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; +; X64-LFENCE-LABEL: test_basic_eh: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: cmpl $41, %edi +; X64-LFENCE-NEXT: jg .LBB3_2 +; X64-LFENCE-NEXT: # %bb.1: # %thrower +; X64-LFENCE-NEXT: movq %rdx, %r14 +; X64-LFENCE-NEXT: movq %rsi, %rbx +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq %edi, %rax +; X64-LFENCE-NEXT: movl (%rsi,%rax,4), %ebp +; X64-LFENCE-NEXT: movl $4, %edi +; X64-LFENCE-NEXT: callq __cxa_allocate_exception +; X64-LFENCE-NEXT: movl %ebp, (%rax) +; X64-LFENCE-NEXT: .Ltmp0: +; X64-LFENCE-NEXT: xorl %esi, %esi +; X64-LFENCE-NEXT: xorl %edx, %edx +; X64-LFENCE-NEXT: movq %rax, %rdi +; X64-LFENCE-NEXT: callq __cxa_throw +; X64-LFENCE-NEXT: .Ltmp1: +; X64-LFENCE-NEXT: .LBB3_2: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %rbp +; X64-LFENCE-NEXT: retq +; X64-LFENCE-NEXT: .LBB3_3: # %lpad +; X64-LFENCE-NEXT: .Ltmp2: +; X64-LFENCE-NEXT: movl (%rax), %eax +; X64-LFENCE-NEXT: addl (%rbx), %eax +; X64-LFENCE-NEXT: cltq +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +entry: + %a.cmp = icmp slt i32 %a, 42 + br i1 %a.cmp, label %thrower, label %exit + +thrower: + %badidx = getelementptr i32, i32* %ptr1, i32 %a + %secret1 = load i32, i32* %badidx + %e.ptr = call i8* @__cxa_allocate_exception(i64 4) + %e.ptr.cast = bitcast i8* %e.ptr to i32* + store i32 %secret1, i32* %e.ptr.cast + invoke void @__cxa_throw(i8* %e.ptr, i8* null, i8* null) + to label %exit unwind label %lpad + +exit: + ret void + +lpad: + %e = 
landingpad { i8*, i32 } + catch i8* null + %e.catch.ptr = extractvalue { i8*, i32 } %e, 0 + %e.catch.ptr.cast = bitcast i8* %e.catch.ptr to i32* + %secret1.catch = load i32, i32* %e.catch.ptr.cast + %secret2 = load i32, i32* %ptr1 + %secret.sum = add i32 %secret1.catch, %secret2 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + unreachable +}
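
To summarize the mechanism the patch implements, the following stand-alone C++ sketch (illustrative only, not part of the commit; all names are invented for the example) models the data flow the pass arranges at the machine level. The predicate state is all zeros on the architecturally correct path and all ones when executing past a mispredicted branch; OR-ing it into address registers before a load, or into the loaded value afterwards, collapses any misspeculated, attacker-influenced value to all ones so it cannot form a secret-dependent address.

#include <cstdint>
#include <cstdio>

// All zeros on the architecturally correct path, all ones when executing
// down a mispredicted edge. The pass keeps this in a GPR and carries it
// across calls and returns in the high bits of RSP.
using PredState = uint64_t;

// Address hardening: poison an attacker-influenced index or base before it
// is used in a load.
static uint64_t hardenIndex(uint64_t Index, PredState State) {
  return Index | State; // all ones under misspeculation
}

// Post-load hardening: flush the loaded bits to one so a dependent access
// cannot transmit them through a cache side channel.
static uint32_t hardenLoadedValue(uint32_t Value, PredState State) {
  return Value | static_cast<uint32_t>(State);
}

int main() {
  const uint64_t Index = 0x1234;
  const uint32_t Loaded = 0x5678;

  printf("correct path:   index=%#llx value=%#x\n",
         (unsigned long long)hardenIndex(Index, 0),
         hardenLoadedValue(Loaded, 0));
  printf("misspeculating: index=%#llx value=%#x\n",
         (unsigned long long)hardenIndex(Index, ~0ULL),
         hardenLoadedValue(Loaded, ~0ULL));
  return 0;
}

The pass performs the same merges with OR64rr, or with SHRX when EFLAGS must be preserved, and threads the state between blocks with MachineSSAUpdater and through the high bits of RSP at call and return boundaries, as exercised by the tests above.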