Index: include/llvm/Analysis/ControlDependence.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/ControlDependence.h
@@ -0,0 +1,66 @@
+//===- ControlDependence.h - Compute Control Dependences --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_CONTROLDEPENDENCE_H
+#define LLVM_ANALYSIS_CONTROLDEPENDENCE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include <algorithm>
+
+namespace llvm {
+
+/// \brief Answers control-dependence queries for blocks inside a loop
+/// (\p Region) using the post-dominator tree.
+class ControlDependence {
+  // FIXME: do some caching here.
+  PostDominatorTree *PDT;
+  Loop *Region;
+
+public:
+  ControlDependence(PostDominatorTree *PDT, Loop *L) : PDT(PDT), Region(L) {}
+
+  // Placeholder; no caching/query implementation yet.
+  Instruction *dependsOnBlock(BasicBlock *B) { return nullptr; }
+
+  // Placeholder; no implementation yet.
+  static Instruction *findControlInstInBlock(BasicBlock *B) { return nullptr; }
+
+  /// \brief Return the blocks inside the region that \p B is directly
+  /// control-dependent on: predecessors of \p B that \p B does not
+  /// post-dominate.
+  SmallVector<BasicBlock *, 8> findControlBlocks(BasicBlock *B) {
+    SmallVector<BasicBlock *, 8> DirectCD;
+
+    // FIXME: find non-direct CDs as well.
+    std::copy_if(pred_begin(B), pred_end(B), std::back_inserter(DirectCD),
+                 [this, B](BasicBlock *P) {
+                   return (this->Region->contains(P) &&
+                           !this->PDT->dominates(B, P));
+                 });
+
+    return DirectCD;
+  }
+
+  /// \brief Return the conditional branches that \p B is directly
+  /// control-dependent on.
+  SmallVector<Instruction *, 8> findControlInsts(BasicBlock *B) {
+    SmallVector<Instruction *, 8> CI;
+
+    SmallVector<BasicBlock *, 8> CD = findControlBlocks(B);
+    for (BasicBlock *C : CD) {
+      // A control block must end in a conditional branch; anything else
+      // could not create the control dependence.
+      auto *Br = cast<BranchInst>(C->getTerminator());
+      assert(Br->isConditional() && "Control block with unconditional branch");
+      CI.push_back(Br);
+    }
+    return CI;
+  }
+};
+}
+
+#endif
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -16,6 +16,7 @@
   LICM.cpp
   LoadCombine.cpp
   LoopDeletion.cpp
+  LoopDistribute.cpp
   LoopIdiomRecognize.cpp
   LoopInstSimplify.cpp
LoopRerollPass.cpp Index: lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/LoopDistribute.cpp @@ -0,0 +1,574 @@ +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/STLExtras.h" +#include +#include +#include +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ControlDependence.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Support/Debug.h" +#include "llvm/Pass.h" + +#define LDIST_NAME "loop-distribute" +#define DEBUG_TYPE LDIST_NAME + +using namespace llvm; + +namespace { + +void remapInstructions(SmallVectorImpl &Blocks, + ValueToValueMapTy &VMap) { + // Rewrite the code to refer to itself. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) + for (BasicBlock::iterator I = Blocks[i]->begin(), + E = Blocks[i]->end(); I != E; ++I) + RemapInstruction(I, VMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); +} + +class InstrPartition { + typedef std::set InstructionSet; + + InstructionSet Set; + Loop *L; + ControlDependence *CD; + +public: + typedef SmallVector BlocksT; + BlocksT Blocks; + ValueToValueMapTy VMap; + bool hasCycle; + + template + InstrPartition(IterTy B, IterTy E, bool hasCycle, Loop *L, + ControlDependence *CD) : L(L), CD(CD), hasCycle(hasCycle) { + for (IterTy I = B; I != E; ++I) { + Set.insert(*I); + } + } + + InstrPartition(Instruction *I, Loop *L, ControlDependence *CD, + bool hasCycle = false) + : L(L), CD(CD), hasCycle(hasCycle) { + Set.insert(I); + } + + void add(Instruction *I) { Set.insert(I); } + + void copyTo(InstrPartition &Other) { + Other.Set.insert(begin(), end()); + } + + InstructionSet::iterator begin() { + return Set.begin(); + } + + InstructionSet::iterator end() { + return Set.end(); + } + + void 
populateUsedSet() { + SmallVector Worklist(Set.begin(), Set.end()); + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + BasicBlock *BB = I->getParent(); + + SmallVector CIs = CD->findControlInsts(BB); + // Insert in the set and any new elements onto the worklist. + for (Instruction *I : CIs) + if (Set.insert(I).second) + Worklist.push_back(I); + + // Insert instructions from the loop that we depend on as well. + for (Value *V : I->operand_values()) { + auto *I = dyn_cast(V); + if (I && L->contains(I->getParent()) && Set.insert(I).second) + Worklist.push_back(I); + } + } + } + + void print() { + if (hasCycle) + dbgs() << " (cycle)\n"; + for (auto *I : Set) + dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; + } + + BasicBlock *getLoopPreheader() { + return Blocks[0]; + } + + void remapInstructions() { + ::remapInstructions(Blocks, VMap); + } + + iterator_range getLoopBlocks() { + return make_range(Blocks.begin() + 1, Blocks.end() - 1); + } + + void removeUnusedInsts() { + SmallVector Unused; + for (auto *BB: L->getBlocks()) + for (auto &I: *BB) + if (!Set.count(&I)) { + Instruction *NewI = &I; + if (!VMap.empty()) + NewI = cast(VMap[NewI]); + auto *Br = dyn_cast(NewI); + if (Br) { + // If this is a conditional branch that no-one is control dependent + // on then we can go either way. Turn it into an unconditional + // branch. Keep unconditional branches. + if (Br->isConditional()) { + BranchInst::Create(Br->getSuccessor(0), Br); + Unused.push_back(NewI); + } + } else + Unused.push_back(NewI); + } + for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) + (*I)->eraseFromParent(); + + dbgs() << "\nAfter removing unused Instrs:\n"; + if (Blocks.empty()) + for (auto *BB: L->getBlocks()) + dbgs() << *BB; + else + for (auto *BB: Blocks) + dbgs() << *BB; + } +}; + +// When splitting A->B, SplitEdge produces A->B->B.split and returns the new +// block B.split, I think that in some other cases it returns the middle block. 
+// To keep this consistent when transforming A->B to A->M->B we return M and +// update B if necessary. +BasicBlock *LDistSplitEdge(BasicBlock *From, BasicBlock *&To, Pass *P) { + // FIXME: args + BasicBlock *NewBlock = SplitEdge(From, To, nullptr, nullptr); + BasicBlock *SinglePred = NewBlock->getSinglePredecessor(); + if (SinglePred && SinglePred != From) { + To = NewBlock; + return SinglePred; + } + return NewBlock; +} + +SmallVector splitLoopPreheadAndExit(Loop *L, Pass *P) { + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *H = L->getHeader(); + BasicBlock *NewPH = LDistSplitEdge(PH, H, P); + BasicBlock *ExitBlock = L->getExitBlock(); + BasicBlock *NewExit = LDistSplitEdge(L->getExitingBlock(), ExitBlock, P); + SmallVector LoopBlocks; + LoopBlocks.push_back(NewPH); + LoopBlocks.append(L->block_begin(), L->block_end()); + LoopBlocks.push_back(NewExit); + return LoopBlocks; +} + +void cloneLoop(BasicBlock *Before, + SmallVectorImpl &LoopBlocks, + SmallVectorImpl &NewBlocks, + ValueToValueMapTy &VMap, const Twine &NameSuffix) { + Function *F = LoopBlocks[0]->getParent(); + for (BasicBlock *BB : LoopBlocks) { + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + NewBlocks.push_back(NewBB); + VMap[BB] = NewBB; +// FIXME: +// LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); + } + // Move them physically from the end of the block list. 
+ F->getBasicBlockList().splice(Before, F->getBasicBlockList(), + NewBlocks[0], F->end()); +} + + +// +// +// +class InstrPartitionContainer { + std::list> PartitionContainer; + Loop *L; + PostDominatorTree *PDT; + ControlDependence CD; + DenseMap InstToPartitionId; + +public: + InstrPartitionContainer(Loop *L, PostDominatorTree *PDT) : + L(L), PDT(PDT), CD(PDT, L) { + } + + unsigned getSize() const { return PartitionContainer.size(); } + + void addToCyclicPartition(Instruction *I) { + if (PartitionContainer.empty() || !PartitionContainer.back()->hasCycle) + PartitionContainer.push_back(llvm::make_unique(I, L, &CD, + true)); + else + PartitionContainer.back()->add(I); + } + + void addToNewNonCyclicPartition(Instruction *I) { + PartitionContainer.push_back(llvm::make_unique(I, L, &CD)); + } + + bool isConditional(Instruction *I) { + return !PDT->dominates(I->getParent(), L->getHeader()); + } + + void merge() { + // Merge adjacent non-cyclic partitions. The idea is that we currently only + // want to isolate the non-vectorizable partition. We could later allow + // more distribution among these partition too. + for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { + auto &Part1 = *I; + if (Part1->hasCycle) + ++I; + else + for (I = std::next(I); + I != PartitionContainer.end() && !(*I)->hasCycle;) { + (*I)->copyTo(*Part1); + I = PartitionContainer.erase(I); + } + } + } + + void setupPartitionIdOnInstructions() { + unsigned PartitionID = 0; + for (auto &PartitionPtr : PartitionContainer) { + for (Instruction *Instr : *PartitionPtr) + InstToPartitionId[Instr] = PartitionID; + ++PartitionID; + } + } + + void populateUsedSet() { + for (auto &P : PartitionContainer) + P->populateUsedSet(); + + // We're done with the partitions set up the reverse relationship from + // instructions to partitions. 
+ setupPartitionIdOnInstructions(); + } + + void setSuccessor(BasicBlock *BB, BasicBlock *Succ) { + auto Branch = cast(BB->getTerminator()); + assert(Branch->isUnconditional()); + Branch->setSuccessor(0, Succ); + } + + void cloneLoops(Pass *P) { + BasicBlock *ExitBlock = L->getExitBlock(); + BasicBlock *PH = L->getLoopPreheader(); + SmallVector LoopBlocks = splitLoopPreheadAndExit(L, P); + BasicBlock *NewPH = LoopBlocks[0]; + + // In order to stich in the new loop on the PH -> NewPH edge: + // * change the NewExit -> ExitBlock to NewExit -> NewPH + // * change PH -> NewPH to PH -> last cloned NewPH + assert(!PartitionContainer.empty() && "at least two partitions expected"); + unsigned Index = 1; + for (auto I = std::next(PartitionContainer.crbegin()), + E = PartitionContainer.crend(); + I != E; + ++I, ++Index) { + auto &Part = *I; + + cloneLoop(NewPH, LoopBlocks, Part->Blocks, Part->VMap, + Twine(".ldist") + Twine(Index)); + Part->VMap[ExitBlock] = NewPH; + NewPH = Part->getLoopPreheader(); + Part->remapInstructions(); + } + setSuccessor(PH, NewPH); + } + + void removeUnusedInsts() { + for (auto &PartitionPtr : PartitionContainer) + PartitionPtr->removeUnusedInsts(); + } + + /// \brief For each memory pointer compute the partition it is used + /// in. If it used in multiple partitions set it to -1. + SmallVector + computePartitionSetForPointers(const LoopAccessInfo &LAI) { + const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck = + LAI.getRuntimePointerCheck(); + + unsigned N = RtPtrCheck->Pointers.size(); + SmallVector Partitions(N); + for (unsigned I = 0; I < N; ++I) { + Value *Ptr = RtPtrCheck->Pointers[I]; + auto Instructions = + LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]); + + int &Partition = Partitions[I]; + // First set it to uninitialized. 
+ Partition = -2; + std::for_each( + Instructions.begin(), Instructions.end(), [&](Instruction *Inst) { + unsigned ThisPartition = this->InstToPartitionId[Inst]; + if (Partition == -2) + Partition = ThisPartition; + // -1 means belonging to multiple partitions. + else if (Partition == -1) + return; + else if (Partition != (int)ThisPartition) + Partition = -1; + }); + assert(Partition != -2 && "Pointer not belonging to any partition"); + } + + return Partitions; + } + + void print() { + unsigned Index = 0; + for (auto I = PartitionContainer.begin(); I != PartitionContainer.end(); + ++I, ++Index) { + dbgs() << Index << ":\n"; + (*I)->print(); + } + } +}; + +class MemoryInstructionDependences { + typedef MemoryDepChecker::Dependence Dependence; +public: + struct Entry { + Instruction *Inst; + unsigned NumUnsafeDependencesStartOrEnd; + + Entry(Instruction *Inst) + : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} + }; + + typedef SmallVector AccessesType; + + AccessesType::const_iterator begin() const { return Accesses.begin(); } + AccessesType::const_iterator end() const { return Accesses.end(); } + + MemoryInstructionDependences( + const SmallVectorImpl &Instructions, + const SmallVectorImpl &InterestingDependences) { + for (auto *Inst : Instructions) + Accesses.push_back(Entry(Inst)); + + DEBUG(dbgs() << "Unsafe dependence:\n"); + for (auto &Dep : InterestingDependences) + if (Dep.isPossiblyBackward()) { + ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; + --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; + + DEBUG(Dep.print(dbgs(), 2, Instructions)); + } + } + +private: + AccessesType Accesses; +}; + +class LDistRuntimeCheckEmitter { + const LoopAccessInfo &LAI; + Loop *L; + BasicBlock *MemCheckBB; + SmallVector OrigLoopBlocks; + SmallVector PtrPartition; + +public: + LDistRuntimeCheckEmitter(const LoopAccessInfo &LAI, Loop *L) : + LAI(LAI), L(L), MemCheckBB(nullptr) { + } + + bool needsRuntimeChecks() const { + // FIXME + return true; + } + + void 
partitionPointers(InstrPartitionContainer &Partitions) { + // Set up partition id in PtrRtChecks. Ptr -> Access -> Intruction -> + // Partition. + PtrPartition = Partitions.computePartitionSetForPointers(LAI); + dbgs() << "\nPointers:\n"; + LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrPartition); + } + + void versionLoop(Pass *P) { + BasicBlock *OrigPH = L->getLoopPreheader(); + SmallVector LoopBlocks = splitLoopPreheadAndExit(L, P); + ValueToValueMapTy VMap; + cloneLoop(*(LoopBlocks.end() - 1), LoopBlocks, OrigLoopBlocks, VMap, + ".ldist.orig"); + remapInstructions(OrigLoopBlocks, VMap); + + // FIXME: fill out nullptr args + MemCheckBB = SplitBlock(OrigPH, OrigPH->getTerminator(), nullptr, nullptr); + MemCheckBB->setName(L->getHeader()->getName() + ".ldist.memcheck"); + + } + + void insertChecks() { + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + Instruction *OrigTerm = MemCheckBB->getTerminator(); + std::tie(FirstCheckInst, MemRuntimeCheck) = + LAI.addRuntimeCheck(OrigTerm, &PtrPartition); + BranchInst::Create(OrigLoopBlocks[0], L->getLoopPreheader(), + MemRuntimeCheck, OrigTerm); + OrigTerm->eraseFromParent(); + } +}; + +struct LoopDistribute : public FunctionPass { + static char ID; + typedef EquivalenceClasses PartitionsT; + + LoopDistribute() : FunctionPass(ID) { + initializeLoopDistributePass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + const DataLayout *DL; + LoopInfo *LI; + AliasAnalysis *AA; + PostDominatorTree *PDT; + TargetLibraryInfo *TLI; + LoopAccessAnalysis *LAA; + + virtual bool runOnFunction(Function &F) { + SE = &getAnalysis(); + DataLayoutPass *DLP = getAnalysisIfAvailable(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + LI = &getAnalysis().getLoopInfo(); + AA = &getAnalysis(); + PDT = &getAnalysis(); + auto *TLIP = getAnalysisIfAvailable(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; + LAA = &getAnalysis(); + + // Build up a worklist of inner-loops to vectorize. 
This is necessary as + // the act of vectorizing or partially unrolling a loop creates new loops + // and can invalidate iterators across the loops. + SmallVector Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + while (!Worklist.empty()) + Changed |= processLoop(Worklist.pop_back_val()); + + // Process each loop nest in the function. + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool processLoop(Loop *L) { + assert(L->empty() && "Only process inner loops."); + + DEBUG(dbgs() << "\nLDist: In \"" + << L->getHeader()->getParent()->getName() << + "\" checking " << *L << "\n"); + + // ScalarEvolution needs to be able to find the exit count. + const SCEV *ExitCount = SE->getBackedgeTakenCount(L); + if (ExitCount == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "LDist: SCEV could not compute the loop exit count.\n"); + return false; + } + + const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + + // Currently, we only distribute to isolate the part of the loop with + // dependence cycles to enable partial vectorization. 
+ if (LAI.canVectorizeMemory()) + return false; + if (LAI.getDepChecker().isSafeForVectorization()) + return false; + auto *InterestingDependences = + LAI.getDepChecker().getInterestingDependences(); + if (!InterestingDependences) + return false; + + InstrPartitionContainer Partitions(L, PDT); + + const MemoryDepChecker &DepChecker = LAI.getDepChecker(); + MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), + *InterestingDependences); + + int NumUnsafeDependencesActive = 0; + for (auto &InstDep : MID) { + Instruction *I = InstDep.Inst; + if (NumUnsafeDependencesActive) + Partitions.addToCyclicPartition(I); + else if (isa(I)) + Partitions.addToNewNonCyclicPartition(I); + NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; + } + + // FIXME: also add defs whose value is used outside the loop + + DEBUG(dbgs() << "Seeded partitions:\n"); + DEBUG(Partitions.print()); + + Partitions.merge(); + DEBUG(dbgs() << "\nMerged partitions:\n"); + DEBUG(Partitions.print()); + + if (Partitions.getSize() < 2) + return false; + + Partitions.populateUsedSet(); + dbgs() << "\nPopulated partitions:\n"; + Partitions.print(); + + LDistRuntimeCheckEmitter RtCheckEmitter(LAI, L); + if (RtCheckEmitter.needsRuntimeChecks()) { + RtCheckEmitter.partitionPointers(Partitions); + RtCheckEmitter.versionLoop(this); + RtCheckEmitter.insertChecks(); + } + + Partitions.cloneLoops(this); + + Partitions.removeUnusedInsts(); + + return true; + } +}; +} + +char LoopDistribute::ID = 0; +static const char ldist_name[] = "Loop Distribition"; + +INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) + +namespace llvm { + Pass *createLoopDistributePass() { + 
return new LoopDistribute(); + } +} Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -76,6 +76,7 @@ initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); initializePlaceSafepointsPass(Registry); + initializeLoopDistributePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {