Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -41,6 +41,15 @@
 
 #define DEBUG_TYPE "mergeicmps"
 
+// Returns true if I is a simple (non-volatile, non-atomic) load or store.
+static bool isSimpleLoadOrStore(const Instruction *I) {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  return false;
+}
+
 // A BCE atom.
 struct BCEAtom {
   BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
@@ -150,18 +159,19 @@
 
   // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
   // instructions in the block.
-  bool canSplit() const;
+  bool canSplit(AliasAnalysis *AA) const;
 
   // Return true if this all the relevant instructions in the BCE-cmp-block can
   // be sunk below this instruction. By doing this, we know we can separate the
   // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
   // block.
-  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
+                         AliasAnalysis *AA) const;
 
   // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
   // instructions. Split the old block and move all non-BCE-cmp-insts into the
   // new parent block.
-  void split(BasicBlock *NewParent) const;
+  void split(BasicBlock *NewParent, AliasAnalysis *AA) const;
 
   // The basic block where this comparison happens.
   BasicBlock *BB = nullptr;
@@ -179,12 +189,21 @@
 };
 
 bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
-                                    DenseSet<Instruction *> &BlockInsts) const {
+                                    DenseSet<Instruction *> &BlockInsts,
+                                    AliasAnalysis *AA) const {
   // If this instruction has side effects and its in middle of the BCE cmp block
   // instructions, then bail for now.
-  // TODO: use alias analysis to tell whether there is real interference.
-  if (Inst->mayHaveSideEffects())
-    return false;
+  if (Inst->mayHaveSideEffects()) {
+    // Bail if this is not a simple load or store
+    if (!isSimpleLoadOrStore(Inst))
+      return false;
+    // Bail if Inst may modify either location read by the compared loads.
+    MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
+    MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
+    if (isModSet(AA->getModRefInfo(Inst, LLoc)) ||
+        isModSet(AA->getModRefInfo(Inst, RLoc)))
+      return false;
+  }
   // Make sure this instruction does not use any of the BCE cmp block
   // instructions as operand.
   for (auto BI : BlockInsts) {
@@ -194,14 +213,15 @@
   return true;
 }
 
-void BCECmpBlock::split(BasicBlock *NewParent) const {
+void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis *AA) const {
   DenseSet<Instruction *> BlockInsts(
       {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
   llvm::SmallVector<Instruction *, 4> OtherInsts;
   for (Instruction &Inst : *BB) {
     if (BlockInsts.count(&Inst))
       continue;
-    assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+    assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
+           "Split unsplittable block");
     // This is a non-BCE-cmp-block instruction. And it can be separated
     // from the BCE-cmp-block instruction.
     OtherInsts.push_back(&Inst);
@@ -213,12 +233,12 @@
   }
 }
 
-bool BCECmpBlock::canSplit() const {
+bool BCECmpBlock::canSplit(AliasAnalysis *AA) const {
   DenseSet<Instruction *> BlockInsts(
       {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
   for (Instruction &Inst : *BB) {
     if (!BlockInsts.count(&Inst)) {
-      if (!canSinkBCECmpInst(&Inst, BlockInsts))
+      if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
         return false;
     }
   }
@@ -324,17 +344,18 @@
 // A chain of comparisons.
 class BCECmpChain {
  public:
-  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+   BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+               AliasAnalysis *AA);
 
-  int size() const { return Comparisons_.size(); }
+   int size() const { return Comparisons_.size(); }
 
 #ifdef MERGEICMPS_DOT_ON
   void dump() const;
 #endif // MERGEICMPS_DOT_ON
 
-  bool simplify(const TargetLibraryInfo *const TLI);
+  bool simplify(const TargetLibraryInfo *const TLI, AliasAnalysis *AA);
 
- private:
+private:
   static bool IsContiguous(const BCECmpBlock &First,
                            const BCECmpBlock &Second) {
     return First.Lhs().Base() == Second.Lhs().Base() &&
@@ -348,7 +369,7 @@
   // null, the merged block will link to the phi block.
   void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                         BasicBlock *const NextBBInChain, PHINode &Phi,
-                        const TargetLibraryInfo *const TLI);
+                        const TargetLibraryInfo *const TLI, AliasAnalysis *AA);
 
   PHINode &Phi_;
   std::vector<BCECmpBlock> Comparisons_;
@@ -356,7 +377,8 @@
   BasicBlock *EntryBlock_;
 };
 
-BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+                         AliasAnalysis *AA)
     : Phi_(Phi) {
   assert(!Blocks.empty() && "a chain should have at least one block");
   // Now look inside blocks to check for BCE comparisons.
@@ -388,7 +410,7 @@
         // and start anew.
         //
         // NOTE: we only handle block with single predecessor for now.
-        if (Comparison.canSplit()) {
+        if (Comparison.canSplit(AA)) {
           LLVM_DEBUG(dbgs()
                      << "Split initial block '" << Comparison.BB->getName()
                      << "' that does extra work besides compare\n");
@@ -475,7 +497,8 @@
 }
 #endif // MERGEICMPS_DOT_ON
 
-bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI,
+                           AliasAnalysis *AA) {
   // First pass to check if there is at least one merge. If not, we don't do
   // anything and we keep analysis passes intact.
   {
@@ -523,13 +546,13 @@
       // Merge all previous comparisons and start a new merge block.
       mergeComparisons(
           makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
-          Comparisons_[I].BB, Phi_, TLI);
+          Comparisons_[I].BB, Phi_, TLI, AA);
       NumMerged = 1;
     }
   }
   mergeComparisons(makeArrayRef(Comparisons_)
                        .slice(Comparisons_.size() - NumMerged, NumMerged),
-                   nullptr, Phi_, TLI);
+                   nullptr, Phi_, TLI, AA);
 
   return true;
 }
@@ -537,7 +560,8 @@
 void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                                    BasicBlock *const NextBBInChain,
                                    PHINode &Phi,
-                                   const TargetLibraryInfo *const TLI) {
+                                   const TargetLibraryInfo *const TLI,
+                                   AliasAnalysis *AA) {
   assert(!Comparisons.empty());
   const auto &FirstComparison = *Comparisons.begin();
   BasicBlock *const BB = FirstComparison.BB;
@@ -550,7 +574,7 @@
   auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
                         [](const BCECmpBlock &B) { return B.RequireSplit; });
   if (C != Comparisons.end())
-    C->split(EntryBlock_);
+    C->split(EntryBlock_, AA);
 
   LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
   const auto TotalSize =
@@ -666,7 +690,8 @@
   return Blocks;
 }
 
-bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI,
+                AliasAnalysis *AA) {
   LLVM_DEBUG(dbgs() << "processPhi()\n");
   if (Phi.getNumIncomingValues() <= 1) {
     LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
@@ -724,14 +749,14 @@
   const auto Blocks =
       getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
   if (Blocks.empty()) return false;
-  BCECmpChain CmpChain(Blocks, Phi);
+  BCECmpChain CmpChain(Blocks, Phi, AA);
 
   if (CmpChain.size() < 2) {
     LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
     return false;
   }
 
-  return CmpChain.simplify(TLI);
+  return CmpChain.simplify(TLI, AA);
 }
 
 class MergeICmps : public FunctionPass {
@@ -746,7 +771,8 @@
     if (skipFunction(F)) return false;
     const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
     const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto PA = runImpl(F, &TLI, &TTI);
+    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+    auto PA = runImpl(F, &TLI, &TTI, AA);
     return !PA.areAllPreserved();
   }
 
@@ -754,14 +780,16 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
   }
 
   PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                            const TargetTransformInfo *TTI);
+                            const TargetTransformInfo *TTI, AliasAnalysis *AA);
 };
 
 PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
-                                      const TargetTransformInfo *TTI) {
+                                      const TargetTransformInfo *TTI,
+                                      AliasAnalysis *AA) {
   LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
 
   // We only try merging comparisons if the target wants to expand memcmp later.
@@ -777,7 +805,7 @@
   for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
     // A Phi operation is always first in a basic block.
     if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
-      MadeChange |= processPhi(*Phi, TLI);
+      MadeChange |= processPhi(*Phi, TLI, AA);
   }
 
   if (MadeChange) return PreservedAnalyses::none();
@@ -791,6 +819,7 @@
                       "Merge contiguous icmps into a memcmp", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
                     "Merge contiguous icmps into a memcmp", false, false)
 
Index: test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- test/CodeGen/AArch64/O3-pipeline.ll
+++ test/CodeGen/AArch64/O3-pipeline.ll
@@ -32,6 +32,8 @@
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
Index: test/CodeGen/Generic/llc-start-stop.ll
===================================================================
--- test/CodeGen/Generic/llc-start-stop.ll
+++ test/CodeGen/Generic/llc-start-stop.ll
@@ -13,15 +13,15 @@
 ; STOP-BEFORE-NOT: Loop Strength Reduction
 
 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -machine-branch-prob -mergeicmps
+; START-AFTER: -aa -mergeicmps
 ; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Merge contiguous icmps into a memcmp
+; START-AFTER-NEXT: Dominator Tree Construction
 
 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
 ; START-BEFORE: -machine-branch-prob -domtree
 ; START-BEFORE: FunctionPass Manager
 ; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Merge contiguous icmps into a memcmp
+; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl)
 
 ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
 ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
Index: test/CodeGen/X86/O3-pipeline.ll
===================================================================
--- test/CodeGen/X86/O3-pipeline.ll
+++ test/CodeGen/X86/O3-pipeline.ll
@@ -26,6 +26,8 @@
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
Index: test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
===================================================================
--- /dev/null
+++ test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mergeicmps -S | FileCheck %s --check-prefix=X86
+
+%"struct.std::pair" = type { i32, i32, i32, i32 }
+
+; Before the pass
+define zeroext i1 @opeq1(
+; CHECK-LABEL: @opeq1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i32
+; CHECK-NEXT:    [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4
+; CHECK-NEXT:    [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4
+; CHECK-NEXT:    store i32 42, i32* [[PTR]]
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]]
+; CHECK:       land.rhs.i:
+; CHECK-NEXT:    [[SECOND_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4
+; CHECK-NEXT:    [[SECOND2_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4
+; CHECK-NEXT:    [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]]
+
+; After the pass
+; We can split %entry and create a memcmp(16 bytes)
+; X86-LABEL: @opeq1(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[PTR:%.*]] = alloca i32
+; X86-NEXT:    store i32 42, i32* [[PTR]]
+; X86-NEXT:    [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; X86-NEXT:    [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86-NEXT:    [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8*
+; X86-NEXT:    [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8*
+; X86-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16)
+; X86-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0
+; X86-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X86:       opeq1.exit:
+; X86-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ]
+; X86-NEXT:    ret i1 [[TMP1]]
+;
+    %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+    %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %ptr = alloca i32
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  ; Does other work, has no interference, merge block
+  store i32 42, i32* %ptr
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp2.i = icmp eq i32 %2, %3
+  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+  %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+  %4 = load i32, i32* %third.i, align 4
+  %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2
+  %5 = load i32, i32* %third2.i, align 4
+  %cmp3.i = icmp eq i32 %4, %5
+  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+  %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3
+  %6 = load i32, i32* %fourth.i, align 4
+  %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+  %7 = load i32, i32* %fourth2.i, align 4
+  %cmp4.i = icmp eq i32 %6, %7
+  br label %opeq1.exit
+
+opeq1.exit:
+  %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+  ret i1 %8
+}