Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -120,6 +120,7 @@
 void initializeEarlyIfConverterPass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
+void initializeFencesPREPass(PassRegistry&);
 void initializeGCOVProfilerPass(PassRegistry&);
 void initializeAddressSanitizerPass(PassRegistry&);
 void initializeAddressSanitizerModulePass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -16,17 +16,32 @@
 #define LLVM_TRANSFORMS_SCALAR_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Intrinsics.h"
 
 namespace llvm {
 
 class BasicBlockPass;
 class FunctionPass;
-class Pass;
 class GetElementPtrInst;
+class Instruction;
+class Pass;
 class PassInfo;
 class TerminatorInst;
 class TargetLowering;
 class TargetMachine;
+class Value;
+
+//===----------------------------------------------------------------------===//
+//
+// FencesPRE - Elimination of target-specific fences based on a PRE algorithm
+//
+FunctionPass *createFencesPREPass(
+        Intrinsic::ID FenceInt = Intrinsic::not_intrinsic,
+        ArrayRef<Value*> FenceArgs = {},
+        std::function<bool(const Instruction&)> isStrongerFence
+            = [](const Instruction &I){return false;},
+        std::function<bool(const Instruction&)> isWeakerFence
+            = [](const Instruction &I){return false;});
 
 //===----------------------------------------------------------------------===//
 //
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -14,7 +14,10 @@
 #include "PPCTargetMachine.h"
 #include "PPC.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
@@ -155,6 +158,28 @@
 
 void PPCPassConfig::addIRPasses() {
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+  if (getOptLevel() != CodeGenOpt::None) {
+    // We first try to eliminate redundant hwsync instructions, ignoring lwsyncs.
+    addPass(createFencesPREPass(Intrinsic::ppc_sync, {},
+                /*IsStrongerFence=*/[](const Instruction &I) {return false;},
+                /*IsWeakerFence=*/[](const Instruction &I) {
+            if (auto FI = dyn_cast<IntrinsicInst>(&I))
+              if (FI->getIntrinsicID() == Intrinsic::ppc_lwsync)
+                return true;
+            return false;}));
+    // Then we eliminate redundant lwsyncs, taking into account the hwsyncs.
+    addPass(createFencesPREPass(Intrinsic::ppc_lwsync, {},
+                /*IsStrongerFence=*/[](const Instruction &I) {
+            if (auto FI = dyn_cast<IntrinsicInst>(&I))
+              if (FI->getIntrinsicID() == Intrinsic::ppc_sync)
+                return true;
+            return false;},
+                /*IsWeakerFence=*/[](const Instruction &I) {return false;}));
+    // These two passes require that critical edges be split. So we should
+    // cleanup the CFG afterwards.
+    // FIXME: breaks a bunch of brittle tests
+    // addPass(createCFGSimplificationPass());
+  }
   TargetPassConfig::addIRPasses();
 }
 
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -7,6 +7,7 @@
   DCE.cpp
   DeadStoreElimination.cpp
   EarlyCSE.cpp
+  FencesPRE.cpp
   FlattenCFGPass.cpp
   GVN.cpp
   IndVarSimplify.cpp
Index: lib/Transforms/Scalar/FencesPRE.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/FencesPRE.cpp
@@ -0,0 +1,521 @@
+//===- FencesPRE.cpp - Fence Elimination Implementation -=====================//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates target-specific fence intrinsics that are redundant, or
+// partially redundant if it can introduce new fences to make them fully
+// redundant.
+//
+// The algorithm is based on the following PRE paper, adapted to work on fences:
+// Bernhard Scholz, R. Nigel Horspool, Jens Knoop:
+//     Optimizing for space and time usage with speculative partial redundancy
+//     elimination.
+// LCTES 2004: 221-230
+//
+// The general idea is to build a graph based on the CFG, with a node for each
+// instruction that may affect memory, and a node for each join/split in the
+// CFG, label this graph with the cost for edge of having a fence on that edge
+// (currently using BlockFrequencyInfo). Then each memory-affecting instruction
+// immediately before a fence is marked as a source, each one immediately after
+// a fence is marked as a sink, and a min-cut provides us a minimum-cost set of
+// fences that provide the same ordering guarantees.
+// There are a few subtleties: for example some nodes must be duplicated to
+// prevent them from being both a source and a sink; and the graph is only
+// built around the fences (to avoid the cost of allocating a potentially huge
+// graph).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "fences-pre"
+
+STATISTIC(NumFencesDetected, "Number of fences pre-FencesPRE");
+STATISTIC(NumFencesInserted, "Number of fences added by FencesPRE");
+STATISTIC(NumFencesDeleted, "Number of fences erased by FencesPRE");
+
+namespace {
+  class FencesPRE : public FunctionPass {
+    public:
+    static char ID;
+    FencesPRE(Intrinsic::ID FenceInt = Intrinsic::not_intrinsic,
+              ArrayRef<Value*> Args = {},
+              // Stronger fences can make the fence under consideration
+              // redundant.
+              std::function<bool(const Instruction&)> IsStrongerFence
+                  = [](const Instruction &I){return false;},
+              // Weaker fences are ignored by the algorithm.
+              std::function<bool(const Instruction&)> IsWeakerFence
+                  = [](const Instruction &I){return false;}) :
+        FunctionPass(ID), FenceIntrinsicID(FenceInt), FenceArgs(Args),
+        isStrongerFence(IsStrongerFence), isWeakerFence(IsWeakerFence) {
+      initializeFencesPREPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequired<BlockFrequencyInfo>();
+    }
+    private:
+    Intrinsic::ID FenceIntrinsicID;
+    ArrayRef<Value*> FenceArgs;
+    std::function<bool(const Instruction&)> isStrongerFence;
+    std::function<bool(const Instruction&)> isWeakerFence;
+    BlockFrequencyInfo *BFI;
+
+    // signed so that we can also represent flow.
+    // Capacity of edges is (BlockFrequency >> 2) + 1
+    // (shifted to fit in 63 bits, +1 to account for code size increases).
+    typedef int64_t Capacity;
+    typedef uint32_t Height;
+
+    typedef struct FlowGraphNode {
+      Instruction * I;
+      // nullptr unless there is exactly one edge leading to this node, and a
+      // fence exist on this edge. Used to keep existing fences when possible,
+      // instead of erasing them and inserting a new one at the same spot.
+      Instruction * FenceBeforeIt;
+      Capacity Excess;
+      Height Height;
+      // This bit is used in the algorithm at the end to extract the min-cut
+      // from the max-flow.
+      bool Mark;
+      std::vector<std::pair<FlowGraphNode*, Capacity> > Succs;
+      std::vector<std::pair<FlowGraphNode*, Capacity> >::iterator Next;
+      // Source nodes must be separate, or a single node could be source and
+      // sink, which causes havoc. Terminator nodes are also duplicated to
+      // avoid trouble when a basic block contains just a terminator node.
+      // This is a unique_ptr because the node must be freed at the end, and
+      // is not in the FlowGraph map (since the non-duplicate already is).
+      std::unique_ptr<struct FlowGraphNode> DupNode;
+      FlowGraphNode() : I(nullptr), FenceBeforeIt(nullptr), Excess(0),
+                        Height(0), Mark(false), Succs(), Next(Succs.begin()),
+                        DupNode() {}
+      FlowGraphNode(Instruction * Inst) : I(Inst), FenceBeforeIt(nullptr),
+                                          Excess(0), Height(0), Mark(false),
+                                          Succs(), Next(Succs.begin()),
+                                          DupNode() {}
+    } FlowGraphNode;
+
+    typedef std::pair<FlowGraphNode*, FlowGraphNode *>                   Edge;
+    typedef DenseMap<Instruction*, std::unique_ptr<FlowGraphNode> > FlowGraph;
+    typedef SmallPtrSetImpl<FlowGraphNode*>                        NodeSubset;
+    typedef SmallVectorImpl<Edge>                                         Cut;
+    typedef DenseMap<Edge, Capacity>                                     Flow;
+    typedef std::vector<std::vector<FlowGraphNode*> >                 Buckets;
+
+    FlowGraphNode *makeGraphUpwards(const Instruction &FI, Instruction &Root,
+                          FlowGraph &FlowGraph, NodeSubset &Sources) const;
+    FlowGraphNode *makeGraphDownwards(const Instruction &FI, Instruction &Root,
+                          FlowGraph &FlowGraph, NodeSubset &Sources) const;
+    void findCut(const FlowGraph &FlowGraph, const NodeSubset &Sources,
+                 const NodeSubset &Sinks, Cut &Cut) const;
+    void insertFenceBefore(Instruction &I) const;
+    bool isStop(const Instruction &I) const;
+    bool isWantedFence(const Instruction &I) const;
+    FlowGraphNode *getNode(FlowGraph &FlowGraph, Instruction &I,
+                           bool shouldDuplicate=false) const;
+    void addEdge(FlowGraphNode *N1, FlowGraphNode *N2) const;
+    Capacity getResidualCapacity(Flow &Flow, Capacity C, FlowGraphNode &U,
+                                 FlowGraphNode &V) const;
+    bool push(Buckets &B, Flow &Flow, const NodeSubset &S, const NodeSubset &T,
+              Capacity C, FlowGraphNode &U, FlowGraphNode &V) const;
+    void relabel(Flow &Flow, FlowGraphNode &U) const;
+    Height discharge(Buckets &B, Flow &Flow, const NodeSubset &S, const NodeSubset &T,
+                   FlowGraphNode &U) const;
+    void markNode(Flow &Flow, Cut &Cut, FlowGraphNode &U) const;
+  };
+}
+
+char FencesPRE::ID = 0;
+INITIALIZE_PASS_BEGIN(FencesPRE, "fences-pre",
+                "Partial redundancy elimination for fences",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
+INITIALIZE_PASS_END(FencesPRE, "fences-pre",
+                "Partial redundancy elimination for fences",
+                false, false)
+FunctionPass *llvm::
+createFencesPREPass(Intrinsic::ID FenceInt, ArrayRef<Value*> FenceArgs,
+                    std::function<bool(const Instruction&)> isStrongerFence,
+                    std::function<bool(const Instruction&)> isWeakerFence) {
+    return new FencesPRE(FenceInt, FenceArgs, isStrongerFence, isWeakerFence);
+}
+
+bool FencesPRE::runOnFunction(Function &F) {
+    if (!FenceIntrinsicID)
+      return false;
+    BFI = &getAnalysis<BlockFrequencyInfo>();
+
+    // This FlowGraph ensures that the entire graph is freed at the end of the
+    // function.
+    DenseMap<Instruction*, std::unique_ptr<FlowGraphNode> > FlowGraph;
+    SmallPtrSet<FlowGraphNode*, 1> Sources;
+    SmallPtrSet<FlowGraphNode*, 1> Sinks;
+    SmallVector<Edge, 1> Cut;
+    SmallPtrSet<Instruction*, 1> Fences;
+
+    for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+      if (isWantedFence(*I)) {
+        Fences.insert(&*I);
+        ++NumFencesDetected;
+        FlowGraphNode *UpNode = makeGraphUpwards(*I, *I, FlowGraph, Sources);
+        FlowGraphNode *DownNode = makeGraphDownwards(*I, *I, FlowGraph, Sinks);
+        // These nodes can be null if there is a stronger fence just
+        // before/after this one
+        if (UpNode && DownNode) {
+          DownNode->FenceBeforeIt = &*I;
+          addEdge(UpNode, DownNode);
+        }
+      }
+    }
+    if (Fences.empty())
+      return false;
+
+    DEBUG(dbgs() << "Sources:\n";
+      for (auto S : Sources) {
+        dbgs() << "\t" << S << " : "; S->I->dump();
+        });
+    DEBUG(dbgs() << "Sinks:\n";
+      for (auto T : Sinks) {
+        dbgs() << "\t" << T << " : "; T->I->dump();
+        });
+
+    findCut(FlowGraph, Sources, Sinks, Cut);
+
+    for (auto NPair : Cut) {
+      assert(NPair.first->I->getParent() == NPair.second->I->getParent());
+      DEBUG(dbgs() << "Pair in the cut:" << NPair.first << " - " <<
+                                            NPair.second << "\n";
+            dbgs() << "\t"; NPair.first->I->dump();
+            dbgs() << "\t"; NPair.second->I->dump());
+      // If a fence is already there, just mark it as not for deletion.
+      Instruction *FenceBetweenNPair = NPair.second->FenceBeforeIt;
+      if (FenceBetweenNPair != nullptr) {
+        DEBUG(dbgs() << "preserving fence " << FenceBetweenNPair << "\n");
+        Fences.erase(FenceBetweenNPair);
+        continue;
+      }
+      // Otherwise, insert one.
+      insertFenceBefore(*(NPair.second->I));
+      ++NumFencesInserted;
+    }
+    // And then erase any fence that is still marked for deletion.
+    for (auto FI : Fences) {
+      FI->eraseFromParent();
+      ++NumFencesDeleted;
+    }
+
+    return !Fences.empty();
+}
+
+bool FencesPRE::isWantedFence(const Instruction &I) const {
+  auto FI = dyn_cast<IntrinsicInst>(&I);
+  if (!FI)
+    return false;
+  if (FI->getIntrinsicID() != FenceIntrinsicID)
+    return false;
+  unsigned NumArgs = FI->getNumArgOperands();
+  if (NumArgs != FenceArgs.size())
+    return false;
+  for (unsigned i = 0; i < NumArgs; ++i) {
+    // Args for CallInst are enumerated from 1.
+    if (FenceArgs[i] != FI->getArgOperand(i+1))
+      return false;
+  }
+
+  return true;
+}
+
+FencesPRE::FlowGraphNode *
+FencesPRE::getNode(FlowGraph &FlowGraph, Instruction &I, bool shouldDuplicate) const {
+  // Does nothing if already present.
+  // FIXME: this does one allocation every time.
+  FlowGraph.insert(make_pair(&I, std::unique_ptr<FlowGraphNode>(new FlowGraphNode(&I))));
+  auto Node = FlowGraph[&I].get();
+  if (shouldDuplicate) {
+    if(!Node->DupNode) {
+      Node->DupNode = std::unique_ptr<FlowGraphNode>(new FlowGraphNode(&I));
+    }
+    Node = Node->DupNode.get();
+  }
+  return Node;
+}
+
+void FencesPRE::addEdge(FlowGraphNode *N1, FlowGraphNode *N2) const {
+  if (N1 == nullptr || N2 == nullptr)
+    return;
+
+  auto BB1 = N1->I->getParent(), BB2 = N2->I->getParent();
+  Capacity Capacity;
+  if (BB1 != BB2)
+    Capacity = (UINT64_MAX >> 1);
+  // Plus one, because it should never be 0, because of code size, inserting a
+  // fence is never completely free, even if the code is only very rarely
+  // executed
+  else
+    Capacity = (BFI->getBlockFreq(BB1).getFrequency() >> 2) + 1;
+
+  N1->Succs.push_back(std::make_pair(N2, Capacity));
+  // This may invalidate iterators, so we reset N->Next;
+  N1->Next = N1->Succs.begin();
+  // We also insert the reverse edge so that excess preflow can flow backward.
+  N1->Succs.push_back(std::make_pair(N2, 0));
+  N2->Next = N2->Succs.begin();
+  DEBUG(dbgs() << "addEdge (" << Capacity << "): "<< N1 << " -> " << N2 << "\n";
+        dbgs() << "\t"; N1->I->dump();
+        dbgs() << "\t"; N2->I->dump(););
+}
+
+// FIXME: The first argument is currently unused, will be useful once metadata
+// is attached to fences by AtomicExpand.
+FencesPRE::FlowGraphNode *FencesPRE::
+makeGraphUpwards(const Instruction &FI, Instruction &Root,
+                 FlowGraph &FlowGraph, NodeSubset &Sources) const {
+  auto BB = Root.getParent();
+  DEBUG(dbgs() << "makeGraphUpwards from " << &Root << " : ";
+        Root.dump());
+
+  // Get a reverse iterator at the Root instruction
+  BasicBlock::reverse_iterator it = BB->rbegin();
+  for (; &*it != &Root ; ++it) {}
+
+  for (auto b = BB->rend(); it != b; ++it) {
+    if (isStrongerFence(*it))
+      return nullptr;
+    if (!isStop(*it))
+      continue;
+
+    auto Node = getNode(FlowGraph, *it, true);
+    Sources.insert(Node);
+    return Node;
+  }
+
+  // We have reached the beginning of the basic block without finding any
+  // reason to stop. Let's continue in the predecessors.
+  auto InsertPt = BB->getFirstInsertionPt();
+  auto Node = getNode(FlowGraph, *InsertPt);
+  // But first, we should check whether we are at the beginning of the function.
+  if (&BB->getParent()->getEntryBlock() == BB) {
+    Sources.insert(Node);
+    return Node;
+  }
+  for (auto PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    // FIXME: add a conservative option where we do not go forward if there is
+    // any of our predecessor that we do not post-dominate.
+    BasicBlock *BBP = *PI;
+    Instruction *TermInst = BBP->getTerminator();
+    auto TNode = getNode(FlowGraph, *TermInst, true);
+    auto UpNode = makeGraphUpwards(FI, *TermInst, FlowGraph, Sources);
+    if (UpNode) {
+      addEdge(TNode, Node);
+      addEdge(UpNode, TNode);
+    }
+  }
+  return Node;
+}
+
+// FIXME: reduce duplication with makeGraphUpwards
+FencesPRE::FlowGraphNode *FencesPRE::
+makeGraphDownwards(const Instruction &FI, Instruction &Root,
+                   FlowGraph &FlowGraph, NodeSubset &Sinks) const {
+  auto BB = Root.getParent();
+  DEBUG(dbgs() << "makeGraphDownwards from " << &Root << " : ";
+        Root.dump());
+
+  // Get an iterator at the Root instruction
+  BasicBlock::iterator it = BB->begin();
+  for (; &*it != &Root ; ++it) {}
+
+  for (auto b = BB->end(); it != b; ++it) {
+    if (isStrongerFence(*it))
+      return nullptr;
+    if (!isStop(*it))
+      continue;
+
+    auto Node = getNode(FlowGraph, *it);
+    Sinks.insert(Node);
+    return Node;
+  }
+
+  // We have reached the end of the basic block without finding any
+  // reason to stop. Let's continue in the successors.
+  auto TermInst = BB->getTerminator();
+  auto Node = getNode(FlowGraph, *TermInst, true);
+  for (auto PI = succ_begin(BB), E = succ_end(BB); PI != E; ++PI) {
+    // FIXME: add a conservative option where we do not go forward if there is
+    // any of our predecessor that we do not post-dominate.
+    BasicBlock *BBS = *PI;
+    Instruction *InsertPt = BBS->getFirstInsertionPt();
+    auto INode = getNode(FlowGraph, *InsertPt);
+    auto DownNode = makeGraphDownwards(FI, *InsertPt, FlowGraph, Sinks);
+    if (DownNode) {
+      addEdge(Node, INode);
+      addEdge(INode, DownNode);
+    }
+  }
+  return Node;
+}
+
+bool FencesPRE::isStop(const Instruction &I) const {
+  return (I.mayReadOrWriteMemory() &&
+          !isWantedFence(I) &&
+          !isWeakerFence(I)) ||
+      isa<ReturnInst>(I);
+}
+
+void FencesPRE::insertFenceBefore(Instruction &I) const {
+  IRBuilder<> Builder(&I);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Fence = Intrinsic::getDeclaration(M, FenceIntrinsicID);
+  Builder.CreateCall(Fence, FenceArgs);
+}
+
+/* ----- Implementation of a basic push-relabel min-cut algorithm ----- */
+
+FencesPRE::Capacity FencesPRE::
+getResidualCapacity(Flow &Flow, Capacity C,
+                    FlowGraphNode &U, FlowGraphNode &V) const {
+  return C - Flow[std::make_pair(&U, &V)];
+}
+
+
+bool FencesPRE::
+push(Buckets &B, Flow &Flow, const NodeSubset &S, const NodeSubset &T,
+     Capacity C, FlowGraphNode &U, FlowGraphNode &V) const {
+  auto amount = std::min(U.Excess, getResidualCapacity(Flow, C, U, V));
+  if (amount <= 0)
+    return false;
+  Flow[std::make_pair(&U, &V)] += amount;
+  Flow[std::make_pair(&V, &U)] -= amount;
+  U.Excess -= amount;
+  V.Excess += amount;
+  DEBUG(dbgs() << "\t push " << amount << " : " << &U << " -> " << &V << "\n");
+  if (S.count(&V) || T.count(&V))
+    return true;
+  if (B.size() <= V.Height)
+    B.resize(V.Height + 1);
+  // FIXME: it should probably be a set to remove duplicates for efficiency
+  B[V.Height].push_back(&V);
+  return true;
+}
+
+void FencesPRE::relabel(Flow &Flow, FlowGraphNode &U) const {
+  auto minHeight = UINT32_MAX;
+  for (auto &VWithCap : U.Succs) {
+    auto &V = *VWithCap.first;
+    auto C = VWithCap.second;
+    if (getResidualCapacity(Flow, C, U, V) > 0)
+      minHeight = std::min(minHeight, V.Height+1);
+  }
+  DEBUG(dbgs() << "\trelabel " << &U << " from " << U.Height
+               << " to " << minHeight << "\n");
+  assert(minHeight != UINT32_MAX);
+  U.Height = minHeight;
+}
+
+FencesPRE::Height FencesPRE::
+discharge(Buckets &B, Flow &Flow, const NodeSubset &S,
+          const NodeSubset &T, FlowGraphNode &U) const {
+  Height MaxHeight = 0;
+  while (U.Excess > 0) {
+    if (U.Next != U.Succs.end()) {
+      auto VWithCap = *U.Next;
+      auto &V = *VWithCap.first;
+      auto C = VWithCap.second;
+      if (U.Height > V.Height && push(B, Flow, S, T, C, U, V))
+        MaxHeight = std::max(MaxHeight, V.Height);
+      else
+        ++U.Next;
+    } else {
+      relabel(Flow, U);
+      U.Next = U.Succs.begin();
+    }
+  }
+  return MaxHeight;
+}
+
+void FencesPRE::markNode(Flow &Flow, Cut &Cut, FlowGraphNode &U) const {
+  if (U.Mark)
+    return;
+  U.Mark = true;
+  DEBUG(dbgs() << "Marking node " << &U << " : "; U.I->dump());
+  for (auto &VWithCap : U.Succs)
+    if (getResidualCapacity(Flow, VWithCap.second, U, *VWithCap.first) > 0)
+      markNode(Flow, Cut, *VWithCap.first);
+    else
+      Cut.push_back(std::make_pair(&U, VWithCap.first));
+}
+
+void FencesPRE::findCut(const FlowGraph &FlowGraph, const NodeSubset &Sources,
+                        const NodeSubset &Sinks, Cut &Cut) const {
+  Buckets Buckets;
+  // The flow is a separate map and not part of the graph, because when pushing
+  // along an edge U->V we must also update Flow[V->U] and not just Flow[U->V].
+  Flow Flow;
+
+  // First we must make all of the Succs vectors into actual maps.
+  for (auto &InstToNode : FlowGraph) {
+    auto N = InstToNode.second.get();
+    do {
+      std::sort(N->Succs.begin(), N->Succs.end());
+      std::unique(N->Succs.begin(), N->Succs.end());
+    } while ((N = N->DupNode.get()));
+  }
+
+  // Initialization of the sources
+  for (auto S : Sources) {
+    S->Excess = INT64_MAX;
+    S->Height = FlowGraph.size();
+    for (auto &VWithCap : S->Succs)
+      push(Buckets, Flow, Sources, Sinks, VWithCap.second, *S, *VWithCap.first);
+  }
+
+  // We are in a trivial case with no more work to do.
+  if (Buckets.size() == 0) {
+    for (auto N : Sources)
+      markNode(Flow, Cut, *N);
+    return;
+  }
+
+  // signed int so we can detect when it hits -1
+  int CurrentActiveHeight = 0;
+  while (CurrentActiveHeight >= 0) {
+    if (Buckets[CurrentActiveHeight].empty()) {
+      --CurrentActiveHeight;
+      continue;
+    }
+    FlowGraphNode &N = *Buckets[CurrentActiveHeight].back();
+    Buckets[CurrentActiveHeight].pop_back();
+
+    DEBUG(dbgs() << "Discharging node " << &N << " : "; N.I->dump());
+    Height NewMaxHeight = discharge(Buckets, Flow, Sources, Sinks, N);
+    CurrentActiveHeight = std::max(CurrentActiveHeight, (int)NewMaxHeight);
+  }
+  // We are done, just produce the cut now;
+  for (auto N : Sources)
+    markNode(Flow, Cut, *N);
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@
   initializeCorrelatedValuePropagationPass(Registry);
   initializeDCEPass(Registry);
   initializeDeadInstEliminationPass(Registry);
+  initializeFencesPREPass(Registry);
   initializeScalarizerPass(Registry);
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
Index: test/CodeGen/PowerPC/atomics-fences-pre.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/atomics-fences-pre.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; sync 1 + sync 1 -> sync 1
+define i32 @acquire_release(i32* %mem) {
+; CHECK-LABEL: acquire_release
+; CHECK: lwz
+; CHECK: sync 1
+; CHECK-NOT: sync
+; CHECK: stw
+  %val = load atomic i32* %mem acquire, align 4
+  store atomic i32 42, i32* %mem release, align 4
+  ret i32 %val
+}
+
+; sync 0 + sync 1 -> sync 0
+define i32 @acquire_seq_cst(i32* %mem) {
+; CHECK-LABEL: acquire_seq_cst
+; CHECK: lwz
+; CHECK: sync 0
+; CHECK-NOT: sync
+; CHECK: stw
+  %val = load atomic i32* %mem acquire, align 4
+  store atomic i32 42, i32* %mem seq_cst, align 4
+  ret i32 %val
+}
+
+; The insertion of a fence after the monotonic store allows the elimination of
+; the fence before the release store.
+define void @basic-pre(i32* %mem, i1 %cond) {
+; CHECK-LABEL: basic-pre
+entry:
+; CHECK-LABEL: BB#0
+; CHECK: lwz
+; CHECK: sync 1
+; CHECK-NOT: sync
+; CHECK: b
+  %val1 = load atomic i32* %mem acquire, align 4
+  br i1 %cond, label %branch, label %exit
+
+branch:
+; CHECK-LABEL: BB#1
+; CHECK-NOT: sync
+; CHECK: stw
+; CHECK: sync 1
+  store atomic i32 21, i32* %mem monotonic, align 4
+  br label %exit
+
+exit:
+; CHECK-LABEL: LBB2
+; CHECK-NOT: sync
+; CHECK: stw
+  store atomic i32 42, i32* %mem release, align 4
+  ret void
+}