Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp @@ -1675,6 +1675,7 @@ void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); void emitMemCmpResultBlock(bool IsLittleEndian); Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian); + Value *getMemCmpEqZeroOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1699,31 +1700,35 @@ unsigned MaxLoadSize, unsigned LoadsPerBlock) : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) { - IRBuilder<> Builder(CI->getContext()); - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcmp here, + // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - - // Calculate how many load compare blocks are required for an expansion of - // given Size. NumBlocks = calculateNumBlocks(Size); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) - setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. 
- createLoadCmpBlocks(); + if (!IsUsedForZeroCmp || NumBlocks != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. + IRBuilder<> Builder(CI->getContext()); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); } void MemCmpExpansion::createLoadCmpBlocks() { @@ -1810,7 +1815,12 @@ unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); - Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // For a single-block expansion, start inserting before the memcmp call. + if (LoadCmpBlocks.empty()) + Builder.SetInsertPoint(CI); + else + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Value *Cmp = nullptr; for (unsigned i = 0; i < NumLoads; ++i) { unsigned LoadSize = getLoadSize(RemainingBytes); @@ -2071,11 +2081,22 @@ return PhiRes; } +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. 
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { + unsigned NumBytesProcessed = 0; + IRBuilder<> Builder(CI->getContext()); + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) { if (IsUsedForZeroCmp) - return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : + getMemCmpExpansionZeroCase(Size, IsLittleEndian); // This loop calls emitLoadCompareBlock for comparing Size bytes of the two // memcmp sources. It starts with loading using the maximum load size set by Index: llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -17,13 +17,13 @@ ; Check 4 bytes - requires 1 load for each param. define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) { ; CHECK-LABEL: zeroEqualityTest02: -; CHECK: # BB#0: # %loadbb +; CHECK: # BB#0: ; CHECK-NEXT: lwz 3, 0(3) ; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: isel 3, 0, 5, 2 -; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: xor 3, 3, 4 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: srwi 3, 3, 5 +; CHECK-NEXT: xori 3, 3, 1 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4) %not.cmp = icmp ne i32 %call, 0