Index: lib/Transforms/Scalar/MergeICmps.cpp =================================================================== --- lib/Transforms/Scalar/MergeICmps.cpp +++ lib/Transforms/Scalar/MergeICmps.cpp @@ -110,6 +110,10 @@ } // A basic block with a comparison between two BCE atoms. +// The block might do extra work besides the atom comparison, in which case +// doesOtherWork() returns true. Under some conditions, the block can be +// split into the atom comparison part and the "other work" part +// (see canSplit()). // Note: the terminology is misleading: the comparison is symmetric, so there // is no real {l/r}hs. What we want though is to have the same base on the // left (resp. right), so that we can detect consecutive loads. To ensure this @@ -144,19 +148,83 @@ // Returns true if the block does other works besides comparison. bool doesOtherWork() const; + // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp + // instructions in the block. + bool canSplit() const; + + // Return true if this all the relevant instructions in the BCE-cmp-block can + // be sunk below this instruction. By doing this, we know we can separate the + // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the + // block. + bool canSinkBCECmpInst(const Instruction *, DenseSet &) const; + + // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block + // instructions. Split the old block and move all non-BCE-cmp-insts into the + // new parent block. + void split(BasicBlock *NewParent) const; + // The basic block where this comparison happens. BasicBlock *BB = nullptr; // The ICMP for this comparison. ICmpInst *CmpI = nullptr; // The terminating branch. BranchInst *BranchI = nullptr; + // The block requires splitting. + bool RequireSplit = false; - private: +private: BCEAtom Lhs_; BCEAtom Rhs_; int SizeBits_ = 0; }; +bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, + DenseSet &BlockInsts) const { + // If this instruction has side effects and its in middle of the BCE cmp block + // instructions, then bail for now. + // TODO: use alias analysis to tell whether there is real interference. + if (Inst->mayHaveSideEffects()) + return false; + // Make sure this instruction does not use any of the BCE cmp block + // instructions as operand. + for (auto BI : BlockInsts) { + if (is_contained(Inst->operands(), BI)) + return false; + } + return true; +} + +void BCECmpBlock::split(BasicBlock *NewParent) const { + DenseSet BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); + llvm::SmallVector OtherInsts; + for (Instruction &Inst : *BB) { + if (BlockInsts.count(&Inst)) + continue; + assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block"); + // This is a non-BCE-cmp-block instruction. And it can be separated + // from the BCE-cmp-block instruction. + OtherInsts.push_back(&Inst); + } + + // Do the actual spliting. + for (Instruction *Inst : reverse(OtherInsts)) { + Inst->moveBefore(&*NewParent->begin()); + } +} + +bool BCECmpBlock::canSplit() const { + DenseSet BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); + for (Instruction &Inst : *BB) { + if (!BlockInsts.count(&Inst)) { + if (!canSinkBCECmpInst(&Inst, BlockInsts)) + return false; + } + } + return true; +} + bool BCECmpBlock::doesOtherWork() const { AssertConsistent(); // All the instructions we care about in the BCE cmp block. @@ -241,6 +309,17 @@ return {}; } +static inline void enqueueBlock(std::vector &Comparisons, + BCECmpBlock &Comparison) { + DEBUG(dbgs() << "Block '" << Comparison.BB->getName() << "': Found cmp of " + << Comparison.SizeBits() << " bits between " + << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset + << " and " << Comparison.Rhs().Base() << " + " + << Comparison.Rhs().Offset << "\n"); + DEBUG(dbgs() << "\n"); + Comparisons.push_back(Comparison); +} + // A chain of comparisons. class BCECmpChain { public: @@ -266,9 +345,9 @@ // Merges the given comparison blocks into one memcmp block and update // branches. Comparisons are assumed to be continguous. If NextBBInChain is // null, the merged block will link to the phi block. - static void mergeComparisons(ArrayRef Comparisons, - BasicBlock *const NextBBInChain, PHINode &Phi, - const TargetLibraryInfo *const TLI); + void mergeComparisons(ArrayRef Comparisons, + BasicBlock *const NextBBInChain, PHINode &Phi, + const TargetLibraryInfo *const TLI); PHINode &Phi_; std::vector Comparisons_; @@ -295,12 +374,28 @@ DEBUG(dbgs() << "block '" << Comparison.BB->getName() << "' does extra work besides compare\n"); if (Comparisons.empty()) { - // TODO(courbet): The initial block can do other things, and we should - // split them apart in a separate block before the comparison chain. - // Right now we just discard it and make the chain shorter. - DEBUG(dbgs() - << "ignoring initial block '" << Comparison.BB->getName() - << "' that does extra work besides compare\n"); + // This is the initial block in the chain, in case this block does other + // work, we can try to split the block and move the irrelevant + // instructions to the predecessor. + // + // If this is not the initial block in the chain, splitting it wont + // work. + // + // As once split, there will still be instructions before the BCE cmp + // instructions that do other work in program order, i.e. within the + // chain before sorting. Unless we can abort the chain at this point + // and start anew. + // + // NOTE: we only handle block with single predecessor for now. + if (Comparison.canSplit()) { + DEBUG(dbgs() << "Split initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + Comparison.RequireSplit = true; + enqueueBlock(Comparisons, Comparison); + } else { + DEBUG(dbgs() << "ignoring initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + } continue; } // TODO(courbet): Right now we abort the whole chain. We could be @@ -328,13 +423,7 @@ // We could still merge bb1 and bb2 though. return; } - DEBUG(dbgs() << "Block '" << Comparison.BB->getName()<< "': Found cmp of " - << Comparison.SizeBits() << " bits between " - << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset - << " and " << Comparison.Rhs().Base() << " + " - << Comparison.Rhs().Offset << "\n"); - DEBUG(dbgs() << "\n"); - Comparisons.push_back(Comparison); + enqueueBlock(Comparisons, Comparison); } // It is possible we have no suitable comparison to merge. @@ -416,9 +505,11 @@ } // Point the predecessors of the chain to the first comparison block (which is - // the new entry point). - if (EntryBlock_ != Comparisons_[0].BB) + // the new entry point) and update the entry block of the chain. + if (EntryBlock_ != Comparisons_[0].BB) { EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + EntryBlock_ = Comparisons_[0].BB; + } // Effectively merge blocks. int NumMerged = 1; @@ -450,6 +541,14 @@ LLVMContext &Context = BB->getContext(); if (Comparisons.size() >= 2) { + // If there is one block that requires splitting, we do it now, i.e. + // just before we know we will collapse the chain. The instructions + // can be executed before any of the instructions in the chain. + auto C = std::find_if(Comparisons.begin(), Comparisons.end(), + [](const BCECmpBlock &B) { return B.RequireSplit; }); + if (C != Comparisons.end()) + C->split(EntryBlock_); + DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); const auto TotalSize = std::accumulate(Comparisons.begin(), Comparisons.end(), 0, Index: test/Transforms/MergeICmps/X86/split-block-does-work.ll =================================================================== --- /dev/null +++ test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s --check-prefix=X86 + +%"struct.std::pair" = type { i32, i32, i32, i32 } + +declare void @foo(...) nounwind readnone + +; We can split %entry and create a memcmp(16 bytes). +define zeroext i1 @opeq1( +; X86-LABEL: @opeq1( +; +; Make sure this call is moved to the beginning of the entry block. +; X86: entry: +; X86-NEXT: call void (...) @foo() +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[THIRD1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[THIRD1_I]] to i8* +; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) +; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; + %"struct.std::pair"* nocapture readonly dereferenceable(16) %a, + %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { +entry: + %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 + %0 = load i32, i32* %first.i, align 4 + %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 + %1 = load i32, i32* %first1.i, align 4 + ; Does other work. + call void (...) @foo() + %cmp.i = icmp eq i32 %0, %1 + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: + %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 + %2 = load i32, i32* %second.i, align 4 + %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 + %3 = load i32, i32* %second2.i, align 4 + %cmp2.i = icmp eq i32 %2, %3 + br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit + +land.rhs.i.2: + %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2 + %4 = load i32, i32* %third.i, align 4 + %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2 + %5 = load i32, i32* %third2.i, align 4 + %cmp3.i = icmp eq i32 %4, %5 + br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit + +land.rhs.i.3: + %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3 + %6 = load i32, i32* %fourth.i, align 4 + %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3 + %7 = load i32, i32* %fourth2.i, align 4 + %cmp4.i = icmp eq i32 %6, %7 + br label %opeq1.exit + +opeq1.exit: + %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ] + ret i1 %8 +} + + +; We will not be able to merge anything, make sure the call is not moved out. +define zeroext i1 @opeq1_discontiguous( +; X86-LABEL: @opeq1_discontiguous( +; +; Make sure this call is moved in the entry block. +; X86: entry: +; X86: [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 1 +; X86: [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; X86: call void (...) @foo() + %"struct.std::pair"* nocapture readonly dereferenceable(16) %a, + %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { +entry: + %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 + %0 = load i32, i32* %first.i, align 4 + %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 + %1 = load i32, i32* %first1.i, align 4 + ; Does other work. + call void (...) @foo() + %cmp.i = icmp eq i32 %0, %1 + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: + %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2 + %2 = load i32, i32* %second.i, align 4 + %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 + %3 = load i32, i32* %second2.i, align 4 + %cmp2.i = icmp eq i32 %2, %3 + br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit + +land.rhs.i.2: + %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2 + %4 = load i32, i32* %third.i, align 4 + %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3 + %5 = load i32, i32* %third2.i, align 4 + %cmp3.i = icmp eq i32 %4, %5 + br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit + +land.rhs.i.3: + %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 + %6 = load i32, i32* %fourth.i, align 4 + %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3 + %7 = load i32, i32* %fourth2.i, align 4 + %cmp4.i = icmp eq i32 %6, %7 + br label %opeq1.exit + +opeq1.exit: + %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ] + ret i1 %8 +} Index: test/Transforms/MergeICmps/X86/tuple-four-int8.ll =================================================================== --- test/Transforms/MergeICmps/X86/tuple-four-int8.ll +++ test/Transforms/MergeICmps/X86/tuple-four-int8.ll @@ -19,28 +19,24 @@ define zeroext i1 @opeq( ; CHECK-LABEL: @opeq( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[A_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[A_ELEM3_ADDR]], align 1 -; CHECK-NEXT: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[B_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[B_ELEM3_ADDR]], align 1 -; CHECK-NEXT: [[CMP_ELEM3:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_ELEM3]], label [[LAND_ELEM0:%.*]], label [[OPEQ_EXIT:%.*]] +; +; These 2 instructions are split. Then we can merge 3 bytes, instead of 2. +; CHECK: br label [[LAND_ELEM0:%.*]] ; CHECK: land.elem1: -; CHECK-NEXT: [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 1 -; CHECK-NEXT: [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 1 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 2) +; CHECK-NEXT: [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %a.base, i64 1 +; CHECK-NEXT: [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %b.base, i64 1 +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 3) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br label [[OPEQ_EXIT]] +; CHECK-NEXT: br label [[OPEQ_EXIT:%.*]] ; CHECK: land.elem0: +; CHECK: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 +; CHECK: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[A_BASE]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[B_BASE]], align 1 ; CHECK-NEXT: [[CMP_ELEM0:%.*]] = icmp eq i8 [[TMP3]], [[TMP4]] ; CHECK-NEXT: br i1 [[CMP_ELEM0]], label [[LAND_ELEM1:%.*]], label [[OPEQ_EXIT]] ; CHECK: opeq.exit: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ] ; CHECK-NEXT: ret i1 [[TMP5]] ; %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,