Index: llvm/trunk/include/llvm/InitializePasses.h
===================================================================
--- llvm/trunk/include/llvm/InitializePasses.h
+++ llvm/trunk/include/llvm/InitializePasses.h
@@ -256,6 +256,7 @@
 void initializeMemorySSAWrapperPassPass(PassRegistry&);
 void initializeMemorySanitizerPass(PassRegistry&);
 void initializeMergeFunctionsPass(PassRegistry&);
+void initializeMergeICmpsPass(PassRegistry&);
 void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
 void initializeMetaRenamerPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
Index: llvm/trunk/include/llvm/LinkAllPasses.h
===================================================================
--- llvm/trunk/include/llvm/LinkAllPasses.h
+++ llvm/trunk/include/llvm/LinkAllPasses.h
@@ -179,6 +179,7 @@
       (void) llvm::createPostOrderFunctionAttrsLegacyPass();
       (void) llvm::createReversePostOrderFunctionAttrsPass();
       (void) llvm::createMergeFunctionsPass();
+      (void) llvm::createMergeICmpsPass();
       std::string buf;
       llvm::raw_string_ostream os(buf);
       (void) llvm::createPrintModulePass(os);
Index: llvm/trunk/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar.h
+++ llvm/trunk/include/llvm/Transforms/Scalar.h
@@ -422,6 +422,12 @@
 //===----------------------------------------------------------------------===//
 //
+// MergeICmps - Merge integer comparison chains
+//
+Pass *createMergeICmpsPass();
+
+//===----------------------------------------------------------------------===//
+//
 // ValuePropagation - Propagate CFG-derived value information
 //
 Pass *createCorrelatedValuePropagationPass();
Index: llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
+++ llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
@@ -94,6 +94,10 @@
     "enable-implicit-null-checks",
     cl::desc("Fold null checks into faulting memory operations"),
     cl::init(false));
+static cl::opt<bool> EnableMergeICmps(
+    "enable-mergeicmps",
+    cl::desc("Merge ICmp chains into a single memcmp"),
+    cl::init(false));
 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
     cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
@@ -591,6 +595,10 @@
     addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
+  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
+    addPass(createMergeICmpsPass());
+  }
+
   // Run GC lowering passes for builtin collectors
   // TODO: add a pass insertion point here
   addPass(createGCLoweringPass());
Index: llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
+++ llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
@@ -42,6 +42,7 @@
   LowerExpectIntrinsic.cpp
   LowerGuardIntrinsic.cpp
   MemCpyOptimizer.cpp
+  MergeICmps.cpp
   MergedLoadStoreMotion.cpp
   NaryReassociate.cpp
   NewGVN.cpp
Index: llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
+++ llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
@@ -0,0 +1,641 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or non-member operator==().
+//
+// The basic idea is to replace a longer chain of small integer comparisons
+// loaded from contiguous memory locations with a shorter chain of larger
+// integer comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n-byte compare is smaller than that of two n-byte
+//    compares.
+//
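+// For illustration only (this example is not part of the pass contract):
+// given a simple aggregate
+//
+//   struct S { int a; int b; };
+//   bool operator==(const S &Lhs, const S &Rhs) {
+//     return Lhs.a == Rhs.a && Lhs.b == Rhs.b;
+//   }
+//
+// clang emits one load/icmp/branch triple per field; this pass rewrites the
+// chain into the equivalent of memcmp(&Lhs, &Rhs, 8) == 0, which the backend
+// can in turn expand into a single 8-byte load-and-compare.
+//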
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+#define MERGEICMPS_DOT_ON
+
+// A BCE atom.
+struct BCEAtom {
+  const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
+
+  bool operator<(const BCEAtom &O) const {
+    return Base() == O.Base() ? Offset.slt(O.Offset) : Base() < O.Base();
+  }
+
+  GetElementPtrInst *GEP = nullptr;
+  LoadInst *LoadI = nullptr;
+  APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address
+// and the offset.
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+  BCEAtom Result;
+  if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
+    DEBUG(dbgs() << "load\n");
+    if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+      DEBUG(dbgs() << "used outside of block\n");
+      return {};
+    }
+    if (LoadI->isVolatile()) {
+      DEBUG(dbgs() << "volatile\n");
+      return {};
+    }
+    Value *const Addr = LoadI->getOperand(0);
+    if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+      DEBUG(dbgs() << "GEP\n");
+      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+        DEBUG(dbgs() << "used outside of block\n");
+        return {};
+      }
+      const auto &DL = GEP->getModule()->getDataLayout();
+      if (!isDereferenceablePointer(GEP, DL)) {
+        DEBUG(dbgs() << "not dereferenceable\n");
+        // We need to make sure that we can do comparison in any order, so we
+        // require memory to be unconditionally dereferenceable.
+        return {};
+      }
+      Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+      if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
+        Result.GEP = GEP;
+        Result.LoadI = LoadI;
+      }
+    }
+  }
+  return Result;
+}
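+// For example (illustrative IR only), given
+//
+//   %gep = getelementptr inbounds %S, %S* %a, i64 0, i32 1
+//   %val = load i32, i32* %gep
+//
+// visitICmpLoadOperand(%val) returns an atom with Base() == %a and
+// Offset == 4, assuming %S is a struct of two i32 fields with no padding.
+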
+// A basic block with a comparison between two BCE atoms.
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. To break the symmetry, we use the smallest atom as Lhs.
+class BCECmpBlock {
+ public:
+  BCECmpBlock() {}
+
+  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+    if (Rhs_ < Lhs_)
+      std::swap(Rhs_, Lhs_);
+  }
+
+  bool IsValid() const {
+    return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
+  }
+
+  // Assert that the block is consistent: if valid, it should also have
+  // non-null members besides Lhs_ and Rhs_.
+  void AssertConsistent() const {
+    if (IsValid()) {
+      assert(BB);
+      assert(CmpI);
+      assert(BranchI);
+    }
+  }
+
+  const BCEAtom &Lhs() const { return Lhs_; }
+  const BCEAtom &Rhs() const { return Rhs_; }
+  int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+  bool doesOtherWork() const;
+
+  // The basic block where this comparison happens.
+  BasicBlock *BB = nullptr;
+  // The ICMP for this comparison.
+  ICmpInst *CmpI = nullptr;
+  // The terminating branch.
+  BranchInst *BranchI = nullptr;
+
+ private:
+  BCEAtom Lhs_;
+  BCEAtom Rhs_;
+  int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::doesOtherWork() const {
+  AssertConsistent();
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+  // effects outside of the basic block.
+  // Note: The GEPs and/or loads are not necessarily in the same block.
+  for (const Instruction &Inst : *BB) {
+    if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+      if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP))
+        return true;
+    } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
+      if (!(Lhs_.LoadI == L || Rhs_.LoadI == L))
+        return true;
+    } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
+      if (C != CmpI)
+        return true;
+    } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
+      if (Br != BranchI)
+        return true;
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+                      const ICmpInst::Predicate ExpectedPredicate) {
+  if (CmpI->getPredicate() == ExpectedPredicate) {
+    DEBUG(dbgs() << "cmp "
+                 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+                 << "\n");
+    auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
+    if (!Lhs.Base())
+      return {};
+    auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+    if (!Rhs.Base())
+      return {};
+    return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+                       CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+  }
+  return {};
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+                          const BasicBlock *const PhiBlock) {
+  if (Block->empty())
+    return {};
+  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+  if (!BranchI)
+    return {};
+  DEBUG(dbgs() << "branch\n");
+  if (BranchI->isUnconditional()) {
+    // In this case, we expect an incoming value which is the result of the
+    // comparison. This is the last link in the chain of comparisons (note
+    // that this does not mean that this is the last incoming value, blocks
+    // can be reordered).
+    auto *const CmpI = dyn_cast<ICmpInst>(Val);
+    if (!CmpI)
+      return {};
+    DEBUG(dbgs() << "icmp\n");
+    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  } else {
+    // In this case, we expect a constant incoming value (the comparison is
+    // chained).
+    const auto *const Const = dyn_cast<ConstantInt>(Val);
+    DEBUG(dbgs() << "const\n");
+    if (!Const->isZero())
+      return {};
+    DEBUG(dbgs() << "false\n");
+    auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+    if (!CmpI)
+      return {};
+    DEBUG(dbgs() << "icmp\n");
+    assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+    BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+    auto Result = visitICmp(
+        CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  }
+  return {};
+}
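+// For example (illustrative IR), an intermediate link of a chain looks like
+//
+//   bb2:
+//     %cmp = icmp eq i32 %lhs, %rhs
+//     br i1 %cmp, label %bb3, label %bb_phi
+//
+// and contributes the constant false to the phi, so it is visited expecting
+// ICMP_EQ; a block whose *true* edge goes to the phi block would be visited
+// expecting ICMP_NE instead.
+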
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+  int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+  void dump() const;
+#endif // MERGEICMPS_DOT_ON
+
+  bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
+  static bool IsContiguous(const BCECmpBlock &First,
+                           const BCECmpBlock &Second) {
+    return First.Lhs().Base() == Second.Lhs().Base() &&
+           First.Rhs().Base() == Second.Rhs().Base() &&
+           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+  }
+
+  // Merges the given comparison blocks into one memcmp block and updates
+  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+  // null, the merged block will link to the phi block.
+  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                               BasicBlock *const NextBBInChain, PHINode &Phi,
+                               const TargetLibraryInfo *const TLI);
+
+  PHINode &Phi_;
+  std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+  BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+    : Phi_(Phi) {
+  // Now look inside blocks to check for BCE comparisons.
+  std::vector<BCECmpBlock> Comparisons;
+  for (BasicBlock *Block : Blocks) {
+    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+                                           Block, Phi.getParent());
+    Comparison.BB = Block;
+    if (!Comparison.IsValid()) {
+      DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+      return;
+    }
+    if (Comparison.doesOtherWork()) {
+      DEBUG(dbgs() << "block does extra work besides compare\n");
+      if (Comparisons.empty()) { // First block.
+        // TODO(courbet): The first block can do other things, and we should
+        // split them apart in a separate block before the comparison chain.
+        // Right now we just discard it and make the chain shorter.
+        DEBUG(dbgs()
+              << "ignoring first block that does extra work besides compare\n");
+        continue;
+      }
+      // TODO(courbet): Right now we abort the whole chain. We could be
+      // merging only the blocks that don't do other work and resume the
+      // chain from there. For example:
+      //   if (a[0] == b[0]) {   // bb1
+      //     if (a[1] == b[1]) { // bb2
+      //       some_value = 3;   // bb3
+      //       if (a[2] == b[2]) { // bb3
+      //         do a ton of stuff // bb4
+      //       }
+      //     }
+      //   }
+      //
+      // This is:
+      //
+      //   bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+      //    \            \           \               \
+      //     ne           ne          ne              \
+      //      \            \           \               v
+      //       +------------+-----------+----------> bb_phi
+      //
+      // We can only merge the first two comparisons, because bb3* does
+      // "other work" (setting some_value to 3).
+      // We could still merge bb1 and bb2 though.
+      return;
+    }
+    DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+                 << " bits between " << Comparison.Lhs().Base() << " + "
+                 << Comparison.Lhs().Offset << " and "
+                 << Comparison.Rhs().Base() << " + "
+                 << Comparison.Rhs().Offset << "\n");
+    DEBUG(dbgs() << "\n");
+    Comparisons.push_back(Comparison);
+  }
+  // If all blocks were discarded as doing extra work, there is nothing to
+  // merge (and Comparisons[0] below would be invalid).
+  if (Comparisons.empty())
+    return;
+  EntryBlock_ = Comparisons[0].BB;
+  Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "BEFORE REORDERING:\n\n";
+  dump();
+#endif // MERGEICMPS_DOT_ON
+  // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+  std::sort(Comparisons_.begin(), Comparisons_.end(),
+            [](const BCECmpBlock &a, const BCECmpBlock &b) {
+              return a.Lhs() < b.Lhs();
+            });
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "AFTER REORDERING:\n\n";
+  dump();
+#endif // MERGEICMPS_DOT_ON
+}
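+// For example (illustrative): two 32-bit comparisons at (a+0 == b+0) and
+// (a+4 == b+4) are contiguous, since 0 + 32/8 == 4 on both sides, so they can
+// be merged into a single 8-byte memcmp. Sorting by Lhs above is what makes
+// this work even when the source blocks test the higher offset first.
+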
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+  errs() << "digraph dag {\n";
+  errs() << " graph [bgcolor=transparent];\n";
+  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+  errs() << " edge [color=black];\n";
+  for (size_t I = 0; I < Comparisons_.size(); ++I) {
+    const auto &Comparison = Comparisons_[I];
+    errs() << " \"" << I << "\" [label=\"%"
+           << Comparison.Lhs().Base()->getName() << " + "
+           << Comparison.Lhs().Offset << " == %"
+           << Comparison.Rhs().Base()->getName() << " + "
+           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+           << " bytes)\"];\n";
+    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+    if (I > 0)
+      errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+  }
+  errs() << " \"Phi\" [label=\"Phi\"];\n";
+  errs() << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+  // First pass to check if there is at least one merge. If not, we don't do
+  // anything and we keep analysis passes intact.
+  {
+    bool AtLeastOneMerged = false;
+    for (size_t I = 1; I < Comparisons_.size(); ++I) {
+      if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+        AtLeastOneMerged = true;
+        break;
+      }
+    }
+    if (!AtLeastOneMerged)
+      return false;
+  }
+
+  // Remove phi references to comparison blocks, they will be rebuilt as we
+  // merge the blocks.
+  for (const auto &Comparison : Comparisons_) {
+    Phi_.removeIncomingValue(Comparison.BB, false);
+  }
+
+  // Point the predecessors of the chain to the first comparison block (which
+  // is the new entry point).
+  if (EntryBlock_ != Comparisons_[0].BB)
+    EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+
+  // Effectively merge blocks.
+  int NumMerged = 1;
+  for (size_t I = 1; I < Comparisons_.size(); ++I) {
+    if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+      ++NumMerged;
+    } else {
+      // Merge all previous comparisons and start a new merge block.
+      mergeComparisons(
+          makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
+          Comparisons_[I].BB, Phi_, TLI);
+      NumMerged = 1;
+    }
+  }
+  mergeComparisons(makeArrayRef(Comparisons_)
+                       .slice(Comparisons_.size() - NumMerged, NumMerged),
+                   nullptr, Phi_, TLI);
+
+  return true;
+}
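+// For example (illustrative): with sorted 4-byte comparisons at offsets
+// {0, 4, 12, 16}, the loop above emits one merged block doing
+// memcmp(a, b, 8) for the contiguous run {0, 4}, branching on equality to the
+// block for offset 12, and the final call emits memcmp(a+12, b+12, 8) for
+// {12, 16}, which feeds its result directly into the phi.
+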
+void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                                   BasicBlock *const NextBBInChain,
+                                   PHINode &Phi,
+                                   const TargetLibraryInfo *const TLI) {
+  assert(!Comparisons.empty());
+  const auto &FirstComparison = *Comparisons.begin();
+  BasicBlock *const BB = FirstComparison.BB;
+  LLVMContext &Context = BB->getContext();
+
+  if (Comparisons.size() >= 2) {
+    DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+    const auto TotalSize =
+        std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+                        [](int Size, const BCECmpBlock &C) {
+                          return Size + C.SizeBits();
+                        }) /
+        8;
+
+    // Incoming edges do not need to be updated, and both GEPs are already
+    // computing the right address; we just need to:
+    //  - replace the two loads and the icmp with the memcmp,
+    //  - update the branch,
+    //  - update the incoming values in the phi.
+    FirstComparison.BranchI->eraseFromParent();
+    FirstComparison.CmpI->eraseFromParent();
+    FirstComparison.Lhs().LoadI->eraseFromParent();
+    FirstComparison.Rhs().LoadI->eraseFromParent();
+
+    IRBuilder<> Builder(BB);
+    const auto &DL = Phi.getModule()->getDataLayout();
+    Value *const MemCmpCall =
+        emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+                   ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+                   Builder, DL, TLI);
+    Value *const MemCmpIsZero = Builder.CreateICmpEQ(
+        MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+
+    // Add a branch to the next basic block in the chain.
+    if (NextBBInChain) {
+      Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
+      Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+    } else {
+      Builder.CreateBr(Phi.getParent());
+      Phi.addIncoming(MemCmpIsZero, BB);
+    }
+
+    // Delete merged blocks.
+    for (size_t I = 1; I < Comparisons.size(); ++I) {
+      BasicBlock *CBB = Comparisons[I].BB;
+      CBB->replaceAllUsesWith(BB);
+      CBB->eraseFromParent();
+    }
+  } else {
+    assert(Comparisons.size() == 1);
+    // There are no blocks to merge, but we still need to update the branches.
+    DEBUG(dbgs() << "Only one comparison, updating branches\n");
+    if (NextBBInChain) {
+      if (FirstComparison.BranchI->isConditional()) {
+        DEBUG(dbgs() << "conditional -> conditional\n");
+        // Just update the "true" target; the "false" target should already be
+        // the phi block.
+        assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+        FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+        Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+      } else {
+        DEBUG(dbgs() << "unconditional -> conditional\n");
+        // Replace the unconditional branch by a conditional one.
+        FirstComparison.BranchI->eraseFromParent();
+        IRBuilder<> Builder(BB);
+        Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+                             Phi.getParent());
+        Phi.addIncoming(FirstComparison.CmpI, BB);
+      }
+    } else {
+      if (FirstComparison.BranchI->isConditional()) {
+        DEBUG(dbgs() << "conditional -> unconditional\n");
+        // Replace the conditional branch by an unconditional one.
+        FirstComparison.BranchI->eraseFromParent();
+        IRBuilder<> Builder(BB);
+        Builder.CreateBr(Phi.getParent());
+        Phi.addIncoming(FirstComparison.CmpI, BB);
+      } else {
+        DEBUG(dbgs() << "unconditional -> unconditional\n");
+        Phi.addIncoming(FirstComparison.CmpI, BB);
+      }
+    }
+  }
+}
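+// After merging, the head block of a run looks roughly like this
+// (illustrative; emitMemCmp also inserts the i8* pointer casts):
+//
+//   %cmp = call i32 @memcmp(i8* %lhs, i8* %rhs, i64 8)
+//   %iszero = icmp eq i32 %cmp, 0
+//   br i1 %iszero, label %next_in_chain, label %bb_phi
+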
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+                                           BasicBlock *const LastBlock,
+                                           int NumBlocks) {
+  // Walk up from the last block to find other blocks.
+  std::vector<BasicBlock *> Blocks(NumBlocks);
+  BasicBlock *CurBlock = LastBlock;
+  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+    if (CurBlock->hasAddressTaken()) {
+      // Somebody is jumping to the block through an address; all bets are
+      // off.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has its address taken\n");
+      return {};
+    }
+    Blocks[BlockIndex] = CurBlock;
+    auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+    if (!SinglePredecessor) {
+      // The block has two or more predecessors.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has two or more predecessors\n");
+      return {};
+    }
+    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+      // The block does not link back to the phi.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " does not link back to the phi\n");
+      return {};
+    }
+    CurBlock = SinglePredecessor;
+  }
+  Blocks[0] = CurBlock;
+  return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+  DEBUG(dbgs() << "processPhi()\n");
+  if (Phi.getNumIncomingValues() <= 1) {
+    DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+    return false;
+  }
+  // We are looking for something that has the following structure:
+  //   bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+  //    \            \           \               \
+  //     ne           ne          ne              \
+  //      \            \           \               v
+  //       +------------+-----------+----------> bb_phi
+  //
+  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+  //    It's the only block that contributes a non-constant value to the Phi.
+  //  - All other blocks (bb1, bb2, bb3) must have exactly two successors, one
+  //    of them being the phi block.
+  //  - All intermediate blocks (bb2, bb3) must have only one predecessor.
+  //  - Blocks cannot do other work besides the comparison, see
+  //    doesOtherWork().
+
+  // The blocks are not necessarily ordered in the phi, so we start from the
+  // last block and reconstruct the order.
+  BasicBlock *LastBlock = nullptr;
+  for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+    if (isa<ConstantInt>(Phi.getIncomingValue(I)))
+      continue;
+    if (LastBlock) {
+      // There are several non-constant values.
+      DEBUG(dbgs() << "skip: several non-constant values\n");
+      return false;
+    }
+    LastBlock = Phi.getIncomingBlock(I);
+  }
+  if (!LastBlock) {
+    // There is no non-constant block.
+    DEBUG(dbgs() << "skip: no non-constant block\n");
+    return false;
+  }
+  if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
+    DEBUG(dbgs() << "skip: last block non-phi successor\n");
+    return false;
+  }
+
+  const auto Blocks =
+      getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
+  if (Blocks.empty())
+    return false;
+  BCECmpChain CmpChain(Blocks, Phi);
+
+  if (CmpChain.size() < 2) {
+    DEBUG(dbgs() << "skip: only one compare block\n");
+    return false;
+  }
+
+  return CmpChain.simplify(TLI);
+}
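+// Note: the pass is exposed to opt as -mergeicmps; in the codegen pipeline it
+// is only scheduled when the -enable-mergeicmps flag is set (see
+// TargetPassConfig).
+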
+class MergeICmps : public FunctionPass {
+ public:
+  static char ID;
+
+  MergeICmps() : FunctionPass(ID) {
+    initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F)) return false;
+    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto PA = runImpl(F, &TLI);
+    return !PA.areAllPreserved();
+  }
+
+ private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI);
+};
+
+PreservedAnalyses MergeICmps::runImpl(Function &F,
+                                      const TargetLibraryInfo *TLI) {
+  DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+
+  bool MadeChange = false;
+
+  for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+    // A Phi operation is always first in a basic block.
+    if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+      MadeChange |= processPhi(*Phi, TLI);
+  }
+
+  if (MadeChange)
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+} // namespace
+
+char MergeICmps::ID = 0;
+INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
+                      "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
+                    "Merge contiguous icmps into a memcmp", false, false)
+
+Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
Index: llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
@@ -73,6 +73,7 @@
   initializeLowerExpectIntrinsicPass(Registry);
   initializeLowerGuardIntrinsicLegacyPassPass(Registry);
   initializeMemCpyOptLegacyPassPass(Registry);
+  initializeMergeICmpsPass(Registry);
   initializeMergedLoadStoreMotionLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
Index: llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll
===================================================================
--- llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll
+++ llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll
@@ -0,0 +1,87 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+; CHECK-LABEL: @opeq1(
+; The entry block with zero-offset GEPs is kept, loads are removed.
+; CHECK: entry
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; The two 4-byte loads and compares are replaced with a single 8-byte memcmp.
+; CHECK: @memcmp({{.*}}8)
+; CHECK: icmp eq {{.*}} 0
+; The branch is now a direct branch; the other block has been removed.
+; CHECK: br label %opeq1.exit
+; CHECK-NOT: br
+; The phi is updated.
+; CHECK: phi i1 [ %{{[^,]*}}, %entry ]
+; CHECK-NEXT: ret
+}
+
+; Same as above, but the two blocks appear in inverse order; the pass must
+; sort the comparisons by offset before it can merge them.
+define zeroext i1 @opeq1_inverse(
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+; CHECK-LABEL: @opeq1_inverse(
+; The second block with zero-offset GEPs is kept, loads are removed.
+; CHECK: land.rhs.i
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; The two 4-byte loads and compares are replaced with a single 8-byte memcmp.
+; CHECK: @memcmp({{.*}}8)
+; CHECK: icmp eq {{.*}} 0
+; The branch is now a direct branch; the other block has been removed.
+; CHECK: br label %opeq1.exit
+; CHECK-NOT: br
+; The phi is updated.
+; CHECK: phi i1 [ %{{[^,]*}}, %land.rhs.i ]
+; CHECK-NEXT: ret
+}
+
Index: llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll
===================================================================
--- llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll
+++ llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll
@@ -0,0 +1,73 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+; This is a more involved test: clang generates this weird pattern for a
+; std::tuple of four uint8_t values. Right now we skip the entry block
+; (which defines the base pointer for the other blocks) and the last one
+; (which does not have the expected structure). Only the middle blocks
+; (bytes [1,2]) are merged.
+ +%"class.std::tuple" = type { %"struct.std::_Tuple_impl" } +%"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" } +%"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" } +%"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" } +%"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" } +%"struct.std::_Head_base" = type { i8 } +%"struct.std::_Head_base.4" = type { i8 } +%"struct.std::_Head_base.5" = type { i8 } +%"struct.std::_Head_base.6" = type { i8 } + +define zeroext i1 @opeq( + %"class.std::tuple"* nocapture readonly dereferenceable(4) %a, + %"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 { +entry: + %0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 + %add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3 + %1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1 + %2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 + %add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3 + %3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1 + %cmp.i.i = icmp eq i8 %1, %3 + br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit + +land.rhs.i.i: + %add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2 + %4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1 + %add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2 + %5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1 + %cmp.i.i.i = icmp eq i8 %4, %5 + br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit + +land.rhs.i.i.i: + %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1 + %6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1 + %add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1 + %7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1 + %cmp.i.i.i.i = icmp eq i8 %6, %7 + br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit + +land.rhs.i.i.i.i: + %8 = load i8, i8* %0, align 1 + %9 = load i8, i8* %2, align 1 + %cmp.i.i.i.i.i = icmp eq i8 %8, %9 + br label %opeq.exit + +opeq.exit: + %10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ] + ret i1 %10 +; CHECK-LABEL: @opeq( +; The entry block is kept as is, but the next block is now the merged comparison +; block for bytes [1,2] or the block for the head. +; CHECK: entry +; CHECK: br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit +; The two 1 byte loads and compares at offset 1 are replaced with a single +; 2-byte memcmp. +; CHECK: land.rhs.i.i.i +; CHECK: @memcmp({{.*}}2) +; CHECK: icmp eq {{.*}} 0 +; In the end we have three blocks. 
+; CHECK: phi i1
+; CHECK-SAME: %entry
+; CHECK-SAME: %land.rhs.i.i.i.i
+; CHECK-SAME: %land.rhs.i.i.i
+}
+
Index: llvm/trunk/test/Transforms/MergeICmps/volatile.ll
===================================================================
--- llvm/trunk/test/Transforms/MergeICmps/volatile.ll
+++ llvm/trunk/test/Transforms/MergeICmps/volatile.ll
@@ -0,0 +1,30 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq(
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load volatile i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+; CHECK-LABEL: @opeq(
+; CHECK-NOT: memcmp
+}
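+; The load of %second.i is volatile, so the second block is not a valid
+; BCECmpBlock and the chain must not be merged into a memcmp.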