Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -120,6 +120,7 @@ void initializeEarlyIfConverterPass(PassRegistry&); void initializeEdgeBundlesPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); +void initializeFencesPREPass(PassRegistry &); void initializeGCOVProfilerPass(PassRegistry&); void initializeAddressSanitizerPass(PassRegistry&); void initializeAddressSanitizerModulePass(PassRegistry&); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -16,17 +16,32 @@ #define LLVM_TRANSFORMS_SCALAR_H #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Intrinsics.h" namespace llvm { class BasicBlockPass; class FunctionPass; -class Pass; class GetElementPtrInst; +class Instruction; +class Pass; class PassInfo; class TerminatorInst; class TargetLowering; class TargetMachine; +class Value; + +//===----------------------------------------------------------------------===// +// +// FencesPRE - Elimination of target-specific fences based on a PRE algorithm +// +FunctionPass *createFencesPREPass( + Intrinsic::ID FenceInt = Intrinsic::not_intrinsic, + ArrayRef FenceArgs = {}, + std::function isStrongerFence = + [](const Instruction &I) { return false; }, + std::function isWeakerFence = + [](const Instruction &I) { return false; }); //===----------------------------------------------------------------------===// // Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -14,7 +14,10 @@ #include "PPCTargetMachine.h" #include "PPC.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCStreamer.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" @@ -155,6 +158,31 @@ void PPCPassConfig::addIRPasses() { addPass(createAtomicExpandPass(&getPPCTargetMachine())); + if (getOptLevel() != CodeGenOpt::None) { + // We first try to eliminate redundant hwsync instructions, ignoring + // lwsyncs. + addPass(createFencesPREPass( + Intrinsic::ppc_sync, {}, + /*IsStrongerFence=*/[](const Instruction &I) { return false; }, + /*IsWeakerFence=*/[](const Instruction &I) { + if (auto FI = dyn_cast(&I)) + if (FI->getIntrinsicID() == Intrinsic::ppc_lwsync) return true; + return false; + })); + // Then we eliminate redundant lwsyncs, taking into account the hwsyncs. + addPass(createFencesPREPass( + Intrinsic::ppc_lwsync, {}, + /*IsStrongerFence=*/[](const Instruction &I) { + if (auto FI = dyn_cast(&I)) + if (FI->getIntrinsicID() == Intrinsic::ppc_sync) return true; + return false; + }, + /*IsWeakerFence=*/[](const Instruction &I) { return false; })); + // These two passes require that critical edges be split. So we should + // cleanup the CFG afterwards. + // FIXME: breaks a bunch of brittle tests + // addPass(createCFGSimplificationPass()); + } TargetPassConfig::addIRPasses(); } Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -7,6 +7,7 @@ DCE.cpp DeadStoreElimination.cpp EarlyCSE.cpp + FencesPRE.cpp FlattenCFGPass.cpp GVN.cpp IndVarSimplify.cpp Index: lib/Transforms/Scalar/FencesPRE.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/FencesPRE.cpp @@ -0,0 +1,689 @@ +//===- FencesPRE.cpp - Fence Elimination Implementation -=====================// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates target-specific fence intrinsics that are redundant, or +// partially redundant if it can introduce new fences to make them fully +// redundant. +// +// The algorithm is based on the following PRE paper, adapted to work on fences: +// Bernhard Scholz, R. Nigel Horspool, Jens Knoop: +// Optimizing for space and time usage with speculative partial redundancy +// elimination. +// LCTES 2004: 221-230 +// +// The general idea is to build a graph based on the CFG, with a node for each +// instruction that may affect memory, and a node for each join/split in the +// CFG, label this graph with the cost per edge of having a fence on that edge +// (currently using BlockFrequencyInfo). Then each memory-affecting instruction +// immediately before a fence is marked as a source, each one immediately after +// a fence is marked as a sink, and a min-cut provides us a minimum-cost set of +// fences that provide the same ordering guarantees. +// There are a few subtleties: for example some nodes must be duplicated to +// prevent them from being both a source and a sink; and the graph is only +// built around the fences (to avoid the cost of allocating a potentially huge +// graph). +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Transforms/Scalar.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "fences-pre" + +STATISTIC(NumFencesDetected, "Number of fences before FencesPRE"); +STATISTIC(NumFencesInserted, "Number of fences added by FencesPRE"); +STATISTIC(NumFencesDeleted, "Number of fences erased by FencesPRE"); + +static cl::opt ViewFencesPREFlowGraph( + "view-fences-pre-flow-graph", cl::Hidden, cl::init(false), + cl::desc( + "Show a representation in dot of the flowgraph used by the fences-pre " + "pass")); + +namespace { +// signed so that we can also represent flow. +// Capacity of edges is (BlockFrequency >> 2) + 1 +// (shifted to fit in 63 bits, +1 because adding a fence increases code size, +// even in a block that is never executed). +typedef int64_t Capacity; +typedef uint32_t Height; + +struct FlowGraphNode { + Instruction *I; + // nullptr unless there is exactly one edge leading to this node, and a + // fence exist on this edge. Used to keep existing fences when possible, + // instead of erasing them and inserting a new one at the same spot. + Instruction *FenceBeforeIt; + Capacity Excess; + Height Height; + // This bit is used in the algorithm at the end to extract the min-cut + // from the max-flow. + bool Mark; + std::vector > Succs; + std::vector >::iterator Next; + // Source nodes must be separate, or a single node could be source and + // sink, which causes havoc. Terminator nodes are also duplicated to + // avoid trouble when a basic block contains just a terminator node. + // This is a unique_ptr because the node must be freed at the end, and + // is not in the FlowGraph map (since the non-duplicate already is). + std::unique_ptr DupNode; + FlowGraphNode() LLVM_DELETED_FUNCTION; + FlowGraphNode(Instruction *Inst) + : I(Inst), + FenceBeforeIt(nullptr), + Excess(0), + Height(0), + Mark(false), + Succs(), + Next(Succs.begin()), + DupNode() {} +}; + +typedef std::pair Edge; +typedef DenseMap > FlowGraph; +typedef SmallPtrSetImpl NodeSubset; +typedef SmallVectorImpl Cut; +typedef DenseMap Flow; +typedef std::vector > Buckets; + +struct FlowGraphViewer { + const FlowGraph *InstToNodeMap; + const NodeSubset *Sources; + const NodeSubset *Sinks; + const Flow *F; + FlowGraphViewer(const FlowGraph *FG, const NodeSubset *S, const NodeSubset *T, + const Flow *Fl) + : InstToNodeMap(FG), Sources(S), Sinks(T), F(Fl) {} +}; + +class FencesPRE : public FunctionPass { + public: + static char ID; + FencesPRE(Intrinsic::ID FenceInt = Intrinsic::not_intrinsic, + ArrayRef Args = {}, + // Stronger fences can make the fence under consideration + // redundant. + std::function IsStrongerFence = + [](const Instruction &I) { return false; }, + // Weaker fences are ignored by the algorithm. + std::function IsWeakerFence = + [](const Instruction &I) { return false; }) + : FunctionPass(ID), + FenceIntrinsicID(FenceInt), + FenceArgs(Args), + isStrongerFence(IsStrongerFence), + isWeakerFence(IsWeakerFence) { + initializeFencesPREPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequiredID(BreakCriticalEdgesID); + AU.addRequired(); + } + + private: + Intrinsic::ID FenceIntrinsicID; + ArrayRef FenceArgs; + std::function isStrongerFence; + std::function isWeakerFence; + BlockFrequencyInfo *BFI; + + FlowGraphNode *makeGraphUpwards(const Instruction &FI, Instruction &Root, + FlowGraph &FlowGraph, + NodeSubset &Sources) const; + FlowGraphNode *makeGraphDownwards(const Instruction &FI, Instruction &Root, + FlowGraph &FlowGraph, + NodeSubset &Sources) const; + void findCut(const FlowGraph &FlowGraph, const NodeSubset &Sources, + const NodeSubset &Sinks, Cut &Cut) const; + void insertFenceBefore(Instruction &I) const; + bool isStop(const Instruction &I) const; + bool isWantedFence(const Instruction &I) const; + FlowGraphNode *getNode(FlowGraph &FlowGraph, Instruction &I, + bool shouldDuplicate = false) const; + void addEdge(FlowGraphNode *N1, FlowGraphNode *N2) const; + Capacity getResidualCapacity(Flow &Flow, Capacity C, FlowGraphNode &U, + FlowGraphNode &V) const; + bool push(Buckets &B, Flow &Flow, const NodeSubset &S, const NodeSubset &T, + Capacity C, FlowGraphNode &U, FlowGraphNode &V) const; + void relabel(Flow &Flow, FlowGraphNode &U) const; + Height discharge(Buckets &B, Flow &Flow, const NodeSubset &S, + const NodeSubset &T, FlowGraphNode &U) const; + void markNode(Flow &Flow, Cut &Cut, FlowGraphNode &U) const; +}; +} + +char FencesPRE::ID = 0; +INITIALIZE_PASS_BEGIN(FencesPRE, "fences-pre", + "Partial redundancy elimination for fences", false, false) +INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_END(FencesPRE, "fences-pre", + "Partial redundancy elimination for fences", false, false) +FunctionPass *llvm::createFencesPREPass( + Intrinsic::ID FenceInt, ArrayRef FenceArgs, + std::function isStrongerFence, + std::function isWeakerFence) { + return new FencesPRE(FenceInt, FenceArgs, isStrongerFence, isWeakerFence); +} + +bool FencesPRE::runOnFunction(Function &F) { + assert(FenceIntrinsicID); + BFI = &getAnalysis(); + + // This FlowGraph ensures that the entire graph is freed at the end of the + // function. + DenseMap > FlowGraph; + SmallPtrSet Sources; + SmallPtrSet Sinks; + SmallVector Cut; + SmallPtrSet Fences; + + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (isWantedFence(*I)) { + Fences.insert(&*I); + ++NumFencesDetected; + FlowGraphNode *UpNode = makeGraphUpwards(*I, *I, FlowGraph, Sources); + FlowGraphNode *DownNode = makeGraphDownwards(*I, *I, FlowGraph, Sinks); + // These nodes can be null if there is a stronger fence just + // before/after this one + if (UpNode && DownNode) { + DownNode->FenceBeforeIt = &*I; + addEdge(UpNode, DownNode); + } + } + } + if (Fences.empty()) return false; + + DEBUG(dbgs() << "Sources:\n"; for (auto S + : Sources) { + dbgs() << "\t" << S << " : "; + S->I->dump(); + }); + DEBUG(dbgs() << "Sinks:\n"; for (auto T + : Sinks) { + dbgs() << "\t" << T << " : "; + T->I->dump(); + }); + + findCut(FlowGraph, Sources, Sinks, Cut); + + // We want fences on all the edges in the cut and nowhere else. + // For statistics/debugging, we don't want to count the fences already on + // the cut as deleted and added back. So for every edge in the cut, we + // add a fence if there is none, or mark one on the edge for + // preservation if there are some. Then we delete every fence not marked + // for preservation. Being "marked for preservation" in this context is + // just being removed from the 'Fences' set. + for (auto NPair : Cut) { + assert(NPair.first->I->getParent() == NPair.second->I->getParent()); + DEBUG(dbgs() << "Pair in the cut:" << NPair.first << " - " << NPair.second + << "\n"; + dbgs() << "\t"; NPair.first->I->dump(); dbgs() << "\t"; + NPair.second->I->dump()); + // If a fence is already there, just mark it as not for deletion. + Instruction *FenceBetweenNPair = NPair.second->FenceBeforeIt; + if (FenceBetweenNPair != nullptr) { + DEBUG(dbgs() << "preserving fence " << FenceBetweenNPair << "\n"); + Fences.erase(FenceBetweenNPair); + continue; + } + // Otherwise, insert one. + insertFenceBefore(*(NPair.second->I)); + ++NumFencesInserted; + } + // And then erase any fence that is still marked for deletion. + for (auto FI : Fences) { + FI->eraseFromParent(); + ++NumFencesDeleted; + } + + return !Fences.empty(); +} + +bool FencesPRE::isWantedFence(const Instruction &I) const { + auto FI = dyn_cast(&I); + if (!FI) return false; + if (FI->getIntrinsicID() != FenceIntrinsicID) return false; + unsigned NumArgs = FI->getNumArgOperands(); + if (NumArgs != FenceArgs.size()) return false; + for (unsigned i = 0; i < NumArgs; ++i) { + // Args for CallInst are enumerated from 1. + if (FenceArgs[i] != FI->getArgOperand(i + 1)) return false; + } + + return true; +} + +FlowGraphNode *FencesPRE::getNode(FlowGraph &FlowGraph, Instruction &I, + bool shouldDuplicate) const { + // Does nothing if already present. + // FIXME: this does one allocation every time. + FlowGraph.insert( + make_pair(&I, std::unique_ptr(new FlowGraphNode(&I)))); + auto Node = FlowGraph[&I].get(); + if (shouldDuplicate) { + if (!Node->DupNode) { + Node->DupNode = std::unique_ptr(new FlowGraphNode(&I)); + } + Node = Node->DupNode.get(); + } + return Node; +} + +void FencesPRE::addEdge(FlowGraphNode *N1, FlowGraphNode *N2) const { + if (N1 == nullptr || N2 == nullptr) return; + + auto BB1 = N1->I->getParent(), BB2 = N2->I->getParent(); + Capacity Capacity; + if (BB1 != BB2) Capacity = (std::numeric_limits::max() >> 1); + // Plus one, because it should never be 0, because of code size, inserting a + // fence is never completely free, even if the code is only very rarely + // executed + else + Capacity = (BFI->getBlockFreq(BB1).getFrequency() >> 2) + 1; + + N1->Succs.push_back(std::make_pair(N2, Capacity)); + // We also insert the reverse edge so that excess preflow can flow backward. + N2->Succs.push_back(std::make_pair(N1, 0)); + DEBUG(dbgs() << "addEdge (" << Capacity << "): " << N1 << " -> " << N2 + << "\n"; + dbgs() << "\t"; N1->I->dump(); dbgs() << "\t"; N2->I->dump();); +} + +// FIXME: The first argument is currently unused, will be useful once metadata +// is attached to fences by AtomicExpand. +FlowGraphNode *FencesPRE::makeGraphUpwards(const Instruction &FI, + Instruction &Root, + FlowGraph &FlowGraph, + NodeSubset &Sources) const { + auto BB = Root.getParent(); + DEBUG(dbgs() << "makeGraphUpwards from " << &Root << " : "; Root.dump()); + + // Get a reverse iterator at the Root instruction + BasicBlock::reverse_iterator it = BB->rbegin(); + for (; &*it != &Root; ++it) { + } + + for (auto b = BB->rend(); it != b; ++it) { + if (isStrongerFence(*it)) return nullptr; + if (!isStop(*it)) continue; + + auto Node = getNode(FlowGraph, *it, true); + Sources.insert(Node); + return Node; + } + + // We have reached the beginning of the basic block without finding any + // reason to stop. Let's continue in the predecessors. + auto InsertPt = BB->getFirstInsertionPt(); + auto Node = getNode(FlowGraph, *InsertPt); + // But first, we should check whether we are at the beginning of the function. + if (&BB->getParent()->getEntryBlock() == BB) { + Sources.insert(Node); + return Node; + } + for (auto PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + // FIXME: add a conservative option where we do not go forward if there is + // any of our predecessor that we do not post-dominate. + BasicBlock *BBP = *PI; + Instruction *TermInst = BBP->getTerminator(); + auto TNode = getNode(FlowGraph, *TermInst, true); + auto UpNode = makeGraphUpwards(FI, *TermInst, FlowGraph, Sources); + if (UpNode) { + addEdge(TNode, Node); + addEdge(UpNode, TNode); + } + } + return Node; +} + +// FIXME: reduce duplication with makeGraphUpwards +FlowGraphNode *FencesPRE::makeGraphDownwards(const Instruction &FI, + Instruction &Root, + FlowGraph &FlowGraph, + NodeSubset &Sinks) const { + auto BB = Root.getParent(); + DEBUG(dbgs() << "makeGraphDownwards from " << &Root << " : "; Root.dump()); + + // Get an iterator at the Root instruction + BasicBlock::iterator it = BB->begin(); + for (; &*it != &Root; ++it) { + } + + for (auto b = BB->end(); it != b; ++it) { + if (isStrongerFence(*it)) return nullptr; + if (!isStop(*it)) continue; + + auto Node = getNode(FlowGraph, *it); + Sinks.insert(Node); + return Node; + } + + // We have reached the end of the basic block without finding any + // reason to stop. Let's continue in the successors. + auto TermInst = BB->getTerminator(); + auto Node = getNode(FlowGraph, *TermInst, true); + for (auto PI = succ_begin(BB), E = succ_end(BB); PI != E; ++PI) { + // FIXME: add a conservative option where we do not go forward if there is + // any of our predecessor that we do not post-dominate. + BasicBlock *BBS = *PI; + Instruction *InsertPt = BBS->getFirstInsertionPt(); + auto INode = getNode(FlowGraph, *InsertPt); + auto DownNode = makeGraphDownwards(FI, *InsertPt, FlowGraph, Sinks); + if (DownNode) { + addEdge(Node, INode); + addEdge(INode, DownNode); + } + } + return Node; +} + +bool FencesPRE::isStop(const Instruction &I) const { + return (I.mayReadOrWriteMemory() && !isWantedFence(I) && !isWeakerFence(I)) || + isa(I); +} + +void FencesPRE::insertFenceBefore(Instruction &I) const { + IRBuilder<> Builder(&I); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *Fence = Intrinsic::getDeclaration(M, FenceIntrinsicID); + Builder.CreateCall(Fence, FenceArgs); +} + +/* ----- Implementation of a basic push-relabel min-cut algorithm ----- */ + +Capacity FencesPRE::getResidualCapacity(Flow &Flow, Capacity C, + FlowGraphNode &U, + FlowGraphNode &V) const { + return C - Flow[std::make_pair(&U, &V)]; +} + +bool FencesPRE::push(Buckets &B, Flow &Flow, const NodeSubset &S, + const NodeSubset &T, Capacity C, FlowGraphNode &U, + FlowGraphNode &V) const { + auto amount = std::min(U.Excess, getResidualCapacity(Flow, C, U, V)); + if (amount <= 0) return false; + Flow[std::make_pair(&U, &V)] += amount; + Flow[std::make_pair(&V, &U)] -= amount; + U.Excess -= amount; + V.Excess += amount; + DEBUG(dbgs() << "\t push " << amount << " : " << &U << " -> " << &V << "\n"); + if (S.count(&V) || T.count(&V)) return true; + if (B.size() <= V.Height) B.resize(V.Height + 1); + // FIXME: it should probably be a set to remove duplicates for efficiency + B[V.Height].push_back(&V); + return true; +} + +void FencesPRE::relabel(Flow &Flow, FlowGraphNode &U) const { + auto minHeight = std::numeric_limits::max(); + for (auto &VWithCap : U.Succs) { + auto &V = *VWithCap.first; + auto C = VWithCap.second; + if (getResidualCapacity(Flow, C, U, V) > 0) + minHeight = std::min(minHeight, V.Height + 1); + } + DEBUG(dbgs() << "\trelabel " << &U << " from " << U.Height << " to " + << minHeight << "\n"); + assert(minHeight != std::numeric_limits::max()); + U.Height = minHeight; +} + +Height FencesPRE::discharge(Buckets &B, Flow &Flow, const NodeSubset &S, + const NodeSubset &T, FlowGraphNode &U) const { + Height MaxHeight = 0; + while (U.Excess > 0) { + if (U.Next != U.Succs.end()) { + auto VWithCap = *U.Next; + auto &V = *VWithCap.first; + auto C = VWithCap.second; + if (U.Height > V.Height && push(B, Flow, S, T, C, U, V)) + MaxHeight = std::max(MaxHeight, V.Height); + else + ++U.Next; + } else { + relabel(Flow, U); + U.Next = U.Succs.begin(); + } + } + return MaxHeight; +} + +void FencesPRE::markNode(Flow &Flow, Cut &Cut, FlowGraphNode &U) const { + if (U.Mark) return; + U.Mark = true; + DEBUG(dbgs() << "Marking node " << &U << " : "; U.I->dump()); + for (auto &VWithCap : U.Succs) + if (getResidualCapacity(Flow, VWithCap.second, U, *VWithCap.first) > 0) + markNode(Flow, Cut, *VWithCap.first); + else + Cut.push_back(std::make_pair(&U, VWithCap.first)); +} + +void FencesPRE::findCut(const FlowGraph &FlowGraph, const NodeSubset &Sources, + const NodeSubset &Sinks, Cut &Cut) const { + Buckets Buckets; + // The flow is a separate map and not part of the graph, because when pushing + // along an edge U->V we must also update Flow[V->U] and not just Flow[U->V]. + Flow Flow; + + // First we must make all of the Succs vectors into actual maps. + for (auto &InstToNode : FlowGraph) { + auto N = InstToNode.second.get(); + do { + std::sort(N->Succs.begin(), N->Succs.end()); + auto last = std::unique(N->Succs.begin(), N->Succs.end()); + N->Succs.erase(last, N->Succs.end()); + // And not forget to initialize the N->Next iterators. + N->Next = N->Succs.begin(); + } while ((N = N->DupNode.get())); + } + + // Initialization of the sources + for (auto S : Sources) { + S->Excess = std::numeric_limits::max(); + S->Height = FlowGraph.size(); + for (auto &VWithCap : S->Succs) + push(Buckets, Flow, Sources, Sinks, VWithCap.second, *S, *VWithCap.first); + } + + // We are in a trivial case with no more work to do. + if (Buckets.size() == 0) { + for (auto N : Sources) markNode(Flow, Cut, *N); + return; + } + + // signed int so we can detect when it hits -1 + int CurrentActiveHeight = 0; + while (CurrentActiveHeight >= 0) { + if (Buckets[CurrentActiveHeight].empty()) { + --CurrentActiveHeight; + continue; + } + FlowGraphNode &N = *Buckets[CurrentActiveHeight].back(); + Buckets[CurrentActiveHeight].pop_back(); + + DEBUG(dbgs() << "Discharging node " << &N << " : "; N.I->dump()); + Height NewMaxHeight = discharge(Buckets, Flow, Sources, Sinks, N); + CurrentActiveHeight = std::max(CurrentActiveHeight, (int)NewMaxHeight); + } + // We are done, just produce the cut now; + for (auto N : Sources) markNode(Flow, Cut, *N); + + if (ViewFencesPREFlowGraph) + ViewGraph(FlowGraphViewer(&FlowGraph, &Sources, &Sinks, &Flow), ""); +} + +/* ----- Graph pretty-printing ----- */ + +// GraphTraits are used to be able to print the flow-graph for debugging. +namespace { +struct FGNSuccsIterator + : std::iterator { + std::vector >::iterator InternalIterator; + std::vector >::iterator InternalEnd; + FGNSuccsIterator( + std::vector >::iterator It, + std::vector >::iterator End) + : InternalIterator(It), InternalEnd(End) { + // Skip the first element(s) if it has a capacity of 0. + if (It != End && It->second == 0) this->operator++(); + } + iterator &operator++() { + do { + ++InternalIterator; + // We skip all the edges of capacity 0, they are added by the algorithm + // for pushing the flow back at the end. + } while (InternalIterator != InternalEnd && InternalIterator->second == 0); + return *this; + } + iterator operator++(int n) { + auto tmp = *this; + ++(*this); + return tmp; + } + // FIXME: this should return a FlowGraphNode&, but then it does not compile. + FlowGraphNode *operator*() const { return InternalIterator->first; } + FlowGraphNode *operator->() const { return InternalIterator->first; } + bool operator==(FGNSuccsIterator &other) { + return other.InternalIterator == InternalIterator; + } + bool operator!=(FGNSuccsIterator other) { return !(*this == other); } +}; +struct FGNodesIterator + : std::iterator { + DenseMap >::const_iterator + InternalIterator; + bool IsDup; + FGNodesIterator(DenseMap >::const_iterator It) + : InternalIterator(It), IsDup(false) {} + FlowGraphNode *operator->() const { + auto NodePtr = InternalIterator->second.get(); + if (IsDup) + return NodePtr->DupNode.get(); + else + return NodePtr; + } + // FIXME: this should return a FlowGraphNode&, but then it does not compile. + FlowGraphNode *operator*() const { return (this->operator->()); } + iterator &operator++() { + if ((*this)->DupNode) { + assert(!IsDup); + IsDup = true; + return *this; + } + IsDup = false; + ++InternalIterator; + return *this; + } + iterator operator++(int n) { + auto tmp = *this; + ++(*this); + return tmp; + } + bool operator==(FGNodesIterator &other) { + return other.InternalIterator == InternalIterator && other.IsDup == IsDup; + } + bool operator!=(FGNodesIterator other) { return !(*this == other); } +}; +} +namespace llvm { +template <> +struct GraphTraits { + typedef FlowGraphNode NodeType; + + typedef FGNSuccsIterator ChildIteratorType; + static ChildIteratorType child_begin(NodeType *N) { + return FGNSuccsIterator(N->Succs.begin(), N->Succs.end()); + } + static ChildIteratorType child_end(NodeType *N) { + return FGNSuccsIterator(N->Succs.end(), N->Succs.end()); + } + + typedef FGNodesIterator nodes_iterator; + static nodes_iterator nodes_begin(const FlowGraphViewer &G) { + return FGNodesIterator(G.InstToNodeMap->begin()); + } + static nodes_iterator nodes_end(const FlowGraphViewer &G) { + return FGNodesIterator(G.InstToNodeMap->end()); + } + + static FlowGraphNode *getEntryNode(const FlowGraphViewer &) { + llvm_unreachable("getEntryNode undefined for flowgraphs"); + // They may very well have several source nodes, and even have unconnected + // parts. + } + static unsigned size(FlowGraphViewer *G) { + llvm_unreachable("Size not implemented for flowgraphs."); + // And not trivial either, as some nodes may be duplicated, see DupNode in + // FlowGraphNode definition. + } +}; +// FIXME: Nodes and their DupNode should probably be linked in the graph +// FIXME: Different style for edges in the cut. +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + DOTGraphTraits(bool simple = false) : DefaultDOTGraphTraits(simple) {} + bool isNodeHidden(FlowGraphNode *N) { return N->Succs.empty(); } + static bool hasNodeAddressLabel(const void *N, const FlowGraphViewer &G) { + return true; + } + std::string getNodeLabel(FlowGraphNode *N, const FlowGraphViewer &G) { + auto I = N->I; + std::string Str; + raw_string_ostream OS(Str); + if (G.InstToNodeMap->find(I)->second.get() != N) OS << "DUP: "; + OS << *I; + return OS.str(); + } + static std::string getNodeAttributes(const FlowGraphNode *Node, + const FlowGraphViewer &G) { + // FIXME: SmallPtrSet::count() should accept const pointers + auto N = const_cast(Node); + if (G.Sources->count(N)) return "color=green"; + if (G.Sinks->count(N)) return "color=blue"; + return ""; + } + template + static std::string getEdgeAttributes(FlowGraphNode *N, EdgeIter EI, + const FlowGraphViewer &G) { + auto F = G.F->lookup(std::make_pair(N, *EI)); + auto C = EI.InternalIterator->second; + // Color inter-block edges + if (C == (std::numeric_limits::max() >> 1)) + return "color=red,style=dashed"; + return ("label=\" " + itostr(F) + "/" + itostr(C) + " \""); + } +}; +} Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -35,6 +35,7 @@ initializeCorrelatedValuePropagationPass(Registry); initializeDCEPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeFencesPREPass(Registry); initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); Index: test/CodeGen/PowerPC/atomics-fences-pre.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/atomics-fences-pre.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32 +; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction). +; This is already checked for in Atomics-64.ll +; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64 + +; sync 1 + sync 1 -> sync 1 +define i32 @acquire_release(i32* %mem) { +; CHECK-LABEL: acquire_release +; CHECK: lwz +; CHECK: sync 1 +; CHECK-NOT: sync +; CHECK: stw + %val = load atomic i32* %mem acquire, align 4 + store atomic i32 42, i32* %mem release, align 4 + ret i32 %val +} + +; sync 0 + sync 1 -> sync 0 +define i32 @acquire_seq_cst(i32* %mem) { +; CHECK-LABEL: acquire_seq_cst +; CHECK: lwz +; CHECK: sync 0 +; CHECK-NOT: sync +; CHECK: stw + %val = load atomic i32* %mem acquire, align 4 + store atomic i32 42, i32* %mem seq_cst, align 4 + ret i32 %val +} + +; The insertion of a fence after the monotonic store allows the elimination of +; the fence before the release store. +define void @basic-pre(i32* %mem, i1 %cond) { +; CHECK-LABEL: basic-pre +entry: +; CHECK-LABEL: BB#0 +; CHECK: lwz +; CHECK: sync 1 +; CHECK-NOT: sync +; CHECK: b + %val1 = load atomic i32* %mem acquire, align 4 + br i1 %cond, label %branch, label %exit + +branch: +; CHECK-LABEL: BB#1 +; CHECK-NOT: sync +; CHECK: stw +; CHECK: sync 1 + store atomic i32 21, i32* %mem monotonic, align 4 + br label %exit + +exit: +; CHECK-LABEL: LBB2 +; CHECK-NOT: sync +; CHECK: stw + store atomic i32 42, i32* %mem release, align 4 + ret void +}