Index: llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
===================================================================
--- llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -9,7 +9,30 @@
 /// Description: This pass finds Load Value Injection (LVI) gadgets consisting
 /// of a load from memory (i.e., SOURCE), and any operation that may transmit
 /// the value loaded from memory over a covert channel, or use the value loaded
-/// from memory to determine a branch/call target (i.e., SINK).
+/// from memory to determine a branch/call target (i.e., SINK). After finding
+/// all such gadgets in a given function, the pass minimally inserts LFENCE
+/// instructions in such a manner that the following property is satisfied: for
+/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at
+/// least one LFENCE instruction. The algorithm that implements this minimal
+/// insertion is influenced by an academic paper that minimally inserts memory
+/// fences for high-performance concurrent programs:
+///         http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf
+/// The algorithm implemented in this pass is as follows:
+/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the
+/// following components:
+///    - SOURCE instructions (also includes function arguments)
+///    - SINK instructions
+///    - Basic block entry points
+///    - Basic block terminators
+///    - LFENCE instructions
+/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e.,
+/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been
+/// mitigated, go to step 6.
+/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion.
+/// 4. Insert one LFENCE along each CFG edge that was cut in step 3.
+/// 5. Go to step 2.
+/// 6. If any LFENCEs were inserted, return `true` from runOnFunction() to tell
+/// LLVM that the function was modified.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -37,6 +60,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -45,11 +69,16 @@
 #define PASS_KEY "x86-lvi-load"
 #define DEBUG_TYPE PASS_KEY
 
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
 STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
 STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
                                  "were deployed");
 STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis");
 
+static cl::opt<std::string> OptimizePluginPath(
+    PASS_KEY "-opt-plugin",
+    cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden);
+
 static cl::opt<bool> NoConditionalBranches(
     PASS_KEY "-no-cbranch",
     cl::desc("Don't treat conditional branches as disclosure gadgets. This "
@@ -80,6 +109,12 @@
              "may improve performance, at the cost of security."),
     cl::init(false), cl::Hidden);
 
+static llvm::sys::DynamicLibrary OptimizeDL{};
+typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size,
+                            unsigned int *edges, int *edge_values,
+                            int *cut_edges /* out */, unsigned int edges_size);
+static OptimizeCutT OptimizeCut = nullptr;
+
 #define ARG_NODE nullptr
 #define GADGET_EDGE ((int)(-1))
 #define WEIGHT(EdgeValue) ((double)(2 * (EdgeValue) + 1))
@@ -138,6 +173,11 @@
   getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI,
                  const MachineDominatorTree &MDT,
                  const MachineDominanceFrontier &MDF, bool FixedLoads) const;
+  std::unique_ptr<MachineGadgetGraph>
+  elimEdges(std::unique_ptr<MachineGadgetGraph> Graph) const;
+  void cutEdges(MachineGadgetGraph &G, EdgeSet &CutEdges /* out */) const;
+  int insertFences(MachineGadgetGraph &G,
+                   EdgeSet &CutEdges /* in, out */) const;
 
   bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const;
   bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const;
@@ -240,14 +280,17 @@
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   LLVM_DEBUG(dbgs() << "Hardening data-dependent loads...\n");
-  hardenLoads(MF, false);
+  int FencesInserted = hardenLoads(MF, false);
   LLVM_DEBUG(dbgs() << "Hardening data-dependent loads... Done\n");
   if (!NoFixedLoads) {
     LLVM_DEBUG(dbgs() << "Hardening fixed loads...\n");
-    hardenLoads(MF, true);
+    FencesInserted += hardenLoads(MF, true);
     LLVM_DEBUG(dbgs() << "Hardening fixed loads... Done\n");
   }
-  return false;
+  if (FencesInserted > 0)
+    ++NumFunctionsMitigated;
+  NumFences += FencesInserted;
+  return (FencesInserted > 0);
 }
 
 // Apply the mitigation to `MF`, return the number of fences inserted.
@@ -255,6 +298,8 @@
 // loads; otherwise, mitigation will be applied to non-fixed loads.
 int X86LoadValueInjectionLoadHardeningPass::hardenLoads(MachineFunction &MF,
                                                         bool FixedLoads) const {
+  int FencesInserted = 0;
+
   LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
   const auto &MLI = getAnalysis<MachineLoopInfo>();
   const auto &MDT = getAnalysis<MachineDominatorTree>();
@@ -288,7 +333,27 @@
       return 0;
   }
 
-  return 0;
+  do {
+    LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+    std::unique_ptr<MachineGadgetGraph> ElimGraph = elimEdges(std::move(Graph));
+    LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+    if (ElimGraph->NumGadgets == 0)
+      break;
+
+    EdgeSet CutEdges{*ElimGraph};
+    LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+    cutEdges(*ElimGraph, CutEdges);
+    LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+
+    LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+    FencesInserted += insertFences(*ElimGraph, CutEdges);
+    LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+
+    Graph = GraphBuilder::trim(
+        *ElimGraph, MachineGadgetGraph::NodeSet{*ElimGraph}, CutEdges);
+  } while (true);
+
+  return FencesInserted;
 }
 
 std::unique_ptr<X86LoadValueInjectionLoadHardeningPass::MachineGadgetGraph>
@@ -460,6 +525,193 @@
   return G;
 }
 
+std::unique_ptr<X86LoadValueInjectionLoadHardeningPass::MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::elimEdges(
+    std::unique_ptr<MachineGadgetGraph> Graph) const {
+  MachineGadgetGraph::NodeSet ElimNodes{*Graph};
+  MachineGadgetGraph::EdgeSet ElimEdges{*Graph};
+
+  if (Graph->NumFences > 0) { // eliminate fences
+    for (const auto &E : Graph->edges()) {
+      const MachineGadgetGraph::Node *Dest = E.getDest();
+      if (isFence(Dest->getValue())) {
+        ElimNodes.insert(Dest);
+        ElimEdges.insert(&E);
+        for_each(Dest->edges(),
+                 [&ElimEdges](const MachineGadgetGraph::Edge &E) {
+                   ElimEdges.insert(&E);
+                 });
+      }
+    }
+    LLVM_DEBUG(dbgs() << "Eliminated " << ElimNodes.count()
+                      << " fence nodes\n");
+  }
+
+  // Eliminate gadget edges that are mitigated.
+  int NumGadgets = 0;
+  MachineGadgetGraph::NodeSet Visited{*Graph}, GadgetSinks{*Graph};
+  MachineGadgetGraph::EdgeSet ElimGadgets{*Graph};
+  for (const auto &RootN : Graph->nodes()) {
+    // collect the gadgets for this node
+    for (const auto &E : RootN.edges()) {
+      if (MachineGadgetGraph::isGadgetEdge(E)) {
+        ++NumGadgets;
+        ElimGadgets.insert(&E);
+        GadgetSinks.insert(E.getDest());
+      }
+    }
+    if (GadgetSinks.empty())
+      continue;
+    std::function<void(const MachineGadgetGraph::Node *, bool)> TraverseDFS =
+        [&](const MachineGadgetGraph::Node *N, bool FirstNode) {
+          if (!FirstNode) {
+            Visited.insert(N);
+            if (GadgetSinks.contains(N)) {
+              for (const auto &E : RootN.edges()) {
+                if (MachineGadgetGraph::isGadgetEdge(E) && E.getDest() == N)
+                  ElimGadgets.erase(&E);
+              }
+            }
+          }
+          for (const auto &E : N->edges()) {
+            const MachineGadgetGraph::Node *Dest = E.getDest();
+            if (MachineGadgetGraph::isCFGEdge(E) && !Visited.contains(Dest) &&
+                !ElimEdges.contains(&E))
+              TraverseDFS(Dest, false);
+          }
+        };
+    TraverseDFS(&RootN, true);
+    Visited.clear();
+    GadgetSinks.clear();
+  }
+  LLVM_DEBUG(dbgs() << "Eliminated " << ElimGadgets.count()
+                    << " gadget edges\n");
+  ElimEdges |= ElimGadgets;
+
+  if (!(ElimEdges.empty() && ElimNodes.empty())) {
+    int NumRemainingGadgets = NumGadgets - ElimGadgets.count();
+    Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */,
+                               NumRemainingGadgets);
+  } else {
+    Graph->NumFences = 0;
+    Graph->NumGadgets = NumGadgets;
+  }
+  return Graph;
+}
+
+void X86LoadValueInjectionLoadHardeningPass::cutEdges(
+    MachineGadgetGraph &G,
+    MachineGadgetGraph::EdgeSet &CutEdges /* out */) const {
+  if (!OptimizePluginPath.empty()) {
+    if (!OptimizeDL.isValid()) {
+      std::string ErrorMsg{};
+      OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary(
+          OptimizePluginPath.c_str(), &ErrorMsg);
+      if (!ErrorMsg.empty())
+        report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"');
+      OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut");
+      if (!OptimizeCut)
+        report_fatal_error("Invalid optimization plugin");
+    }
+    std::vector<unsigned int> Nodes(G.nodes_size() + 1 /* terminator node */);
+    std::vector<unsigned int> Edges(G.edges_size());
+    std::vector<int> EdgeCuts(G.edges_size());
+    std::vector<int> EdgeValues(G.edges_size());
+    for (const auto &N : G.nodes()) {
+      Nodes[std::distance(G.nodes_begin(), &N)] =
+          std::distance(G.edges_begin(), N.edges_begin());
+    }
+    Nodes[G.nodes_size()] = G.edges_size(); // terminator node
+    for (const auto &E : G.edges()) {
+      Edges[std::distance(G.edges_begin(), &E)] =
+          std::distance(G.nodes_begin(), E.getDest());
+      EdgeValues[std::distance(G.edges_begin(), &E)] = E.getValue();
+    }
+    OptimizeCut(Nodes.data(), G.nodes_size(), Edges.data(), EdgeValues.data(),
+                EdgeCuts.data(), G.edges_size());
+    for (int I = 0; I < G.edges_size(); ++I) {
+      if (EdgeCuts[I])
+        CutEdges.set(I);
+    }
+  } else { // Use the default greedy heuristic
+    // Find the cheapest CFG edge that will eliminate a gadget (by being egress
+    // from a SOURCE node or ingress to a SINK node), and cut it.
+    MachineGadgetGraph::NodeSet GadgetSinks{G};
+    const MachineGadgetGraph::Edge *CheapestSoFar = nullptr;
+    for (const auto &N : G.nodes()) {
+      for (const auto &E : N.edges()) {
+        if (MachineGadgetGraph::isGadgetEdge(E)) {
+          // NI is a SOURCE node. Look for a cheap egress edge
+          for (const auto &EE : N.edges()) {
+            if (MachineGadgetGraph::isCFGEdge(EE)) {
+              if (!CheapestSoFar || EE.getValue() < CheapestSoFar->getValue())
+                CheapestSoFar = &EE;
+            }
+          }
+          GadgetSinks.insert(E.getDest());
+        } else { // E is a CFG edge
+          if (GadgetSinks.contains(E.getDest())) {
+            // The dest is a SINK node. Hence EI is an ingress edge
+            if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue())
+              CheapestSoFar = &E;
+          }
+        }
+      }
+    }
+    assert(CheapestSoFar && "Failed to cut an edge");
+    CutEdges.insert(CheapestSoFar);
+  }
+  LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+}
+
+int X86LoadValueInjectionLoadHardeningPass::insertFences(
+    MachineGadgetGraph &G, EdgeSet &CutEdges /* in, out */) const {
+  int FencesInserted = 0, AdditionalEdgesCut = 0;
+  auto CutAllCFGEdges =
+      [&CutEdges, &AdditionalEdgesCut](const MachineGadgetGraph::Node *N) {
+        for (const MachineGadgetGraph::Edge &E : N->edges()) {
+          if (MachineGadgetGraph::isCFGEdge(E) && !CutEdges.contains(&E)) {
+            CutEdges.insert(&E);
+            ++AdditionalEdgesCut;
+          }
+        }
+      };
+  for (const auto &N : G.nodes()) {
+    for (const auto &E : N.edges()) {
+      if (CutEdges.contains(&E)) {
+        MachineInstr *MI = N.getValue(), *Prev;
+        MachineBasicBlock *MBB;
+        MachineBasicBlock::iterator InsertionPt;
+        if (MI == ARG_NODE) { // insert LFENCE at beginning of entry block
+          MBB = &G.getMF().front();
+          InsertionPt = MBB->begin();
+          Prev = nullptr;
+        } else if (MI->isBranch()) { // insert the LFENCE before the branch
+          MBB = MI->getParent();
+          InsertionPt = MI;
+          Prev = MI->getPrevNode();
+          CutAllCFGEdges(&N);
+        } else { // insert the LFENCE after the instruction
+          MBB = MI->getParent();
+          InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end();
+          Prev = InsertionPt == MBB->end()
+                     ? (MBB->empty() ? nullptr : &MBB->back())
+                     : InsertionPt->getPrevNode();
+        }
+        if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) &&
+            (!Prev || !isFence(Prev))) {
+          BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
+          ++FencesInserted;
+        }
+      }
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+  LLVM_DEBUG(dbgs() << "Cut an additional " << AdditionalEdgesCut
+                    << " edges during fence insertion\n");
+  return FencesInserted;
+}
+
 bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory(
     const MachineInstr &MI, unsigned Reg) const {
   if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE ||
Index: llvm/test/CodeGen/X86/lvi-hardening-loads.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/lvi-hardening-loads.ll
@@ -0,0 +1,102 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-CBFX
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-fixed < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-CB
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-FX
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-fixed --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-BASE
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @test(i32** %secret, i32 %secret_size) #0 {
+; X64-LABEL: test:
+entry:
+  %secret.addr = alloca i32**, align 8
+  %secret_size.addr = alloca i32, align 4
+  %ret_val = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32** %secret, i32*** %secret.addr, align 8
+  store i32 %secret_size, i32* %secret_size.addr, align 4
+  store i32 0, i32* %ret_val, align 4
+  call void @llvm.x86.sse2.lfence()
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+; X64: # %bb.0: # %entry
+; X64-NEXT:      movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:      movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:      movl $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:      lfence
+; X64-NEXT:      movl $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:      jmp .LBB0_1
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %secret_size.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+; X64: .LBB0_1: # %for.cond
+; X64-NEXT:      # =>This Inner Loop Header: Depth=1
+; X64-NEXT:      movl -{{[0-9]+}}(%rsp), %eax
+; X64-CBFX-NEXT: lfence
+; X64-NEXT:      cmpl -{{[0-9]+}}(%rsp), %eax
+; X64-CBFX-NEXT: lfence
+; X64-NEXT:      jge .LBB0_5
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %rem = srem i32 %2, 2
+  %cmp1 = icmp eq i32 %rem, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+; X64: # %bb.2: # %for.body
+; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NEXT:      movl -{{[0-9]+}}(%rsp), %eax
+; X64-CBFX-NEXT: lfence
+; X64-NEXT:      movl %eax, %ecx
+; X64-NEXT:      shrl $31, %ecx
+; X64-NEXT:      addl %eax, %ecx
+; X64-NEXT:      andl $-2, %ecx
+; X64-NEXT:      cmpl %ecx, %eax
+; X64-NEXT:      jne .LBB0_4
+
+if.then:                                          ; preds = %for.body
+  %3 = load i32**, i32*** %secret.addr, align 8
+  %4 = load i32, i32* %ret_val, align 4
+  %idxprom = sext i32 %4 to i64
+  %arrayidx = getelementptr inbounds i32*, i32** %3, i64 %idxprom
+  %5 = load i32*, i32** %arrayidx, align 8
+  %6 = load i32, i32* %5, align 4
+  store i32 %6, i32* %ret_val, align 4
+  br label %if.end
+
+; X64: # %bb.3: # %if.then
+; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NEXT:      movq -{{[0-9]+}}(%rsp), %rax
+; X64-CBFX-NEXT: lfence
+; X64-FX-NEXT:   lfence
+; X64-NEXT:      movslq -{{[0-9]+}}(%rsp), %rcx
+; X64-CBFX-NEXT: lfence
+; X64-FX-NEXT:   lfence
+; X64-NEXT:      movq (%rax,%rcx,8), %rax
+; X64-NEXT:      lfence
+; X64-NEXT:      movl (%rax), %eax
+; X64-NEXT:      movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:      jmp .LBB0_4
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %ret_val, align 4
+  ret i32 %8
+}
+
+; Function Attrs: nounwind
+declare void @llvm.x86.sse2.lfence() #1
+
+attributes #0 = { "target-features"="+lvi-load-hardening" }
+attributes #1 = { nounwind }