Index: lib/Transforms/Scalar/MergeICmps.cpp =================================================================== --- lib/Transforms/Scalar/MergeICmps.cpp +++ lib/Transforms/Scalar/MergeICmps.cpp @@ -144,6 +144,15 @@ // Returns true if the block does other works besides comparison. bool doesOtherWork() const; + // Return true if this all the relevant instructions in the BCE cmp block can + // be sunk below this instruction. By doing this, we know we can separate the + // BCE-cmp-block instructions with the non-BCE-cmp-block instructions. + bool canSinkBCECmpInst(const Instruction *, DenseSet &) const; + + // Return true if we can separate the BCE cmp insts and the non-BCE cmp + // insts. Additonally, split the old block. False otherwise. + bool tryToSplitBCECmpBlock(); + // The basic block where this comparison happens. BasicBlock *BB = nullptr; // The ICMP for this comparison. @@ -157,6 +166,47 @@ int SizeBits_ = 0; }; +bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, + DenseSet &BlockInsts) const { + // If this instruction has side effect and its in middle of the BCE cmp block + // instructions, then bail for now. + // TODO: use alias analysis to tell whether there is real interference. + if (Inst->mayHaveSideEffects()) + return false; + // Make sure this instruction does not use any of the BCE cmp block + // instructions as operand. + for (auto BI : BlockInsts) { + if (is_contained(Inst->operands(), BI)) + return false; + } + return true; +} + +bool BCECmpBlock::tryToSplitBCECmpBlock() { + DenseSet BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); + llvm::SmallVector OtherInsts; + for (Instruction &Inst : *BB) { + if (!BlockInsts.count(&Inst)) { + if (!canSinkBCECmpInst(&Inst, BlockInsts)) + return false; + // This is a non-BCE-cmp-block instruction. And it can be separated + // from the BCE-cmp-block instruction. + OtherInsts.push_back(&Inst); + } + } + + // Do the actual spliting. + BasicBlock *NBB = + BasicBlock::Create(BB->getContext(), "", BB->getParent(), BB); + BranchInst::Create(BB, NBB); + for (Instruction *Inst : OtherInsts) { + Inst->moveBefore(NBB->getTerminator()); + } + + return true; +} + bool BCECmpBlock::doesOtherWork() const { AssertConsistent(); // All the instructions we care about in the BCE cmp block. @@ -232,6 +282,17 @@ return {}; } +static inline void enqueueBlock(std::vector &Comparisons, + BCECmpBlock &Comparison) { + DEBUG(dbgs() << "Block '" << Comparison.BB->getName() << "': Found cmp of " + << Comparison.SizeBits() << " bits between " + << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset + << " and " << Comparison.Rhs().Base() << " + " + << Comparison.Rhs().Offset << "\n"); + DEBUG(dbgs() << "\n"); + Comparisons.push_back(Comparison); +} + // A chain of comparisons. class BCECmpChain { public: @@ -286,12 +347,23 @@ DEBUG(dbgs() << "block '" << Comparison.BB->getName() << "' does extra work besides compare\n"); if (Comparisons.empty()) { - // TODO(courbet): The initial block can do other things, and we should - // split them apart in a separate block before the comparison chain. - // Right now we just discard it and make the chain shorter. - DEBUG(dbgs() - << "ignoring initial block '" << Comparison.BB->getName() - << "' that does extra work besides compare\n"); + // This is the initial block in the chain, in case this block does other + // work, we try to split the block. + // + // If this is not the initial block in the chain, splitting it wont + // work. + // + // As once split, there will still be instructions before the BCE cmp + // instructions that do other work, i.e. within the chain. Unless + // we can abort the chain at this point and start anew. + if (Comparison.tryToSplitBCECmpBlock()) { + DEBUG(dbgs() << "Split initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + enqueueBlock(Comparisons, Comparison); + } else { + DEBUG(dbgs() << "ignoring initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + } continue; } // TODO(courbet): Right now we abort the whole chain. We could be @@ -319,13 +391,7 @@ // We could still merge bb1 and bb2 though. return; } - DEBUG(dbgs() << "Block '" << Comparison.BB->getName()<< "': Found cmp of " - << Comparison.SizeBits() << " bits between " - << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset - << " and " << Comparison.Rhs().Base() << " + " - << Comparison.Rhs().Offset << "\n"); - DEBUG(dbgs() << "\n"); - Comparisons.push_back(Comparison); + enqueueBlock(Comparisons, Comparison); } // It is possible we have no suitable comparison to merge. Index: test/Transforms/MergeICmps/X86/split-block-does-work.ll =================================================================== --- /dev/null +++ test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s --check-prefix=X86 + +%"struct.std::pair" = type { i32, i32, i32, i32 } + +declare void @foo(...) nounwind readnone + +; We can split %entry and create a memcmp(16 bytes). +define zeroext i1 @opeq1( +; X86-LABEL: @opeq1( +; X86: call void (...) @foo() +; X86: entry: +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[THIRD1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[THIRD1_I]] to i8* +; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) +; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; + %"struct.std::pair"* nocapture readonly dereferenceable(16) %a, + %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { +entry: + %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 + %0 = load i32, i32* %first.i, align 4 + %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 + %1 = load i32, i32* %first1.i, align 4 + ; Does other work. + call void (...) @foo() + %cmp.i = icmp eq i32 %0, %1 + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: + %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 + %2 = load i32, i32* %second.i, align 4 + %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 + %3 = load i32, i32* %second2.i, align 4 + %cmp2.i = icmp eq i32 %2, %3 + br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit + +land.rhs.i.2: + %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2 + %4 = load i32, i32* %third.i, align 4 + %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2 + %5 = load i32, i32* %third2.i, align 4 + %cmp3.i = icmp eq i32 %4, %5 + br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit + +land.rhs.i.3: + %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3 + %6 = load i32, i32* %fourth.i, align 4 + %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3 + %7 = load i32, i32* %fourth2.i, align 4 + %cmp4.i = icmp eq i32 %6, %7 + br label %opeq1.exit + +opeq1.exit: + %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ] + ret i1 %8 +} Index: test/Transforms/MergeICmps/X86/tuple-four-int8.ll =================================================================== --- test/Transforms/MergeICmps/X86/tuple-four-int8.ll +++ test/Transforms/MergeICmps/X86/tuple-four-int8.ll @@ -19,28 +19,24 @@ define zeroext i1 @opeq( ; CHECK-LABEL: @opeq( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[A_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[A_ELEM3_ADDR]], align 1 -; CHECK-NEXT: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[B_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[B_ELEM3_ADDR]], align 1 -; CHECK-NEXT: [[CMP_ELEM3:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_ELEM3]], label [[LAND_ELEM0:%.*]], label [[OPEQ_EXIT:%.*]] +; +; These 2 instructions are split. Then we can merge 3 bytes, instead of 2. +; CHECK: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 +; CHECK: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 +; CHECK: br label [[LAND_ELEM0:%.*]] ; CHECK: land.elem1: ; CHECK-NEXT: [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 1 ; CHECK-NEXT: [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 1 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 2) +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 3) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br label [[OPEQ_EXIT]] +; CHECK-NEXT: br label [[OPEQ_EXIT:%.*]] ; CHECK: land.elem0: ; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[A_BASE]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[B_BASE]], align 1 ; CHECK-NEXT: [[CMP_ELEM0:%.*]] = icmp eq i8 [[TMP3]], [[TMP4]] ; CHECK-NEXT: br i1 [[CMP_ELEM0]], label [[LAND_ELEM1:%.*]], label [[OPEQ_EXIT]] ; CHECK: opeq.exit: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ] ; CHECK-NEXT: ret i1 [[TMP5]] ; %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,