Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -454,6 +454,9 @@ /// \brief Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; + /// \brief Enable inline expansion of memcmp + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const; + /// \brief Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; @@ -828,6 +831,7 @@ unsigned VF) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; + virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -1047,6 +1051,9 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override { + return Impl.expandMemCmp(I, MaxLoadSize); + } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -274,6 +274,8 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; } + bool enableInterleavedAccessVectorization() { return false; } bool isFPVectorizationPotentiallyUnsafe() { return false; } Index: llvm/trunk/include/llvm/Analysis/ValueTracking.h =================================================================== --- llvm/trunk/include/llvm/Analysis/ValueTracking.h +++ llvm/trunk/include/llvm/Analysis/ValueTracking.h @@ -85,6 +85,8 @@ const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); + bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); + /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are Index: llvm/trunk/include/llvm/Target/TargetLowering.h =================================================================== --- llvm/trunk/include/llvm/Target/TargetLowering.h +++ llvm/trunk/include/llvm/Target/TargetLowering.h @@ -1143,6 +1143,16 @@ return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// Get maximum # of load operations permitted for memcmp + /// + /// This function returns the maximum number of load operations permitted + /// to replace a call to memcmp. The value is set by the target at the + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. + unsigned getMaxExpandSizeMemcmp(bool OptSize) const { + return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp; + } + /// \brief Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted @@ -2330,6 +2340,8 @@ /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. unsigned MaxStoresPerMemcpyOptSize; + unsigned MaxLoadsPerMemcmp; + unsigned MaxLoadsPerMemcmpOptSize; /// \brief Specify maximum bytes of store instructions per memmove call. /// Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp +++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp @@ -215,6 +215,10 @@ return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } +bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const { + return TTIImpl->expandMemCmp(I, MaxLoadSize); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } Index: llvm/trunk/lib/Analysis/ValueTracking.cpp =================================================================== --- llvm/trunk/lib/Analysis/ValueTracking.cpp +++ llvm/trunk/lib/Analysis/ValueTracking.cpp @@ -172,6 +172,18 @@ } +bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) { + for (const User *U : CxtI->users()) { + if (const ICmpInst *IC = dyn_cast(U)) + if (IC->isEquality()) + if (Constant *C = dyn_cast(IC->getOperand(1))) + if (C->isNullValue()) + continue; + return false; + } + return true; +} + static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp @@ -24,12 +24,13 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -60,6 +61,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/Transforms/Utils/ValueMapper.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -84,6 +86,12 @@ STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, + "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -144,6 +152,11 @@ cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); +static cl::opt MemCmpNumLoadsPerBlock( + "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), + cl::desc("The number of loads per basic block for inline expansion of " + "memcmp that is only being compared against zero.")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -1629,6 +1642,593 @@ return true; } +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. +class MemCmpExpansion { + struct ResultBlock { + BasicBlock *BB; + PHINode *PhiSrc1; + PHINode *PhiSrc2; + ResultBlock(); + }; + + CallInst *CI; + ResultBlock ResBlock; + unsigned MaxLoadSize; + unsigned NumBlocks; + unsigned NumBlocksNonOneByte; + unsigned NumLoadsPerBlock; + std::vector LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + bool IsUsedForZeroCmp; + int calculateNumBlocks(unsigned Size); + void createLoadCmpBlocks(); + void createResultBlock(); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex, + bool IsLittleEndian); + void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, + unsigned &NumBytesProcessed); + void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); + void emitMemCmpResultBlock(bool IsLittleEndian); + Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian); + unsigned getLoadSize(unsigned Size); + unsigned getNumLoads(unsigned Size); + +public: + MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned NumLoadsPerBlock); + Value *getMemCmpExpansion(bool IsLittleEndian); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() + : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned NumLoadsPerBlock) + : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) { + + IRBuilder<> Builder(CI->getContext()); + + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + uint64_t Size = SizeCast->getZExtValue(); + + // Calculate how many load compare blocks are required for an expansion of + // given Size. + NumBlocks = calculateNumBlocks(Size); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); +} + +void MemCmpExpansion::createLoadCmpBlocks() { + for (unsigned i = 0; i < NumBlocks; i++) { + BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", + EndBlock->getParent(), EndBlock); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock() { + ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", + EndBlock->getParent(), EndBlock); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp paramters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { + IRBuilder<> Builder(CI->getContext()); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + + if (Index < (LoadCmpBlocks.size() - 1)) { + // Early exit branch if difference found to EndBlock, otherwise continue to + // next LoadCmpBlock + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BranchInst *CmpBr = + BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); + Builder.Insert(CmpBr); + } else { + // The last block has an unconditional branch to EndBlock + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); + } +} + +unsigned MemCmpExpansion::getNumLoads(unsigned Size) { + return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); +} + +unsigned MemCmpExpansion::getLoadSize(unsigned Size) { + return MinAlign(PowerOf2Floor(Size), MaxLoadSize); +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( + unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { + + IRBuilder<> Builder(CI->getContext()); + + std::vector XorList, OrList; + Value *Diff; + + unsigned RemainingBytes = Size - NumBytesProcessed; + unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); + unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned LoadSize = getLoadSize(RemainingBytes); + unsigned GEPIndex = NumBytesProcessed / LoadSize; + NumBytesProcessed += LoadSize; + RemainingBytes -= LoadSize; + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType); + XorList.push_back(Diff); + } + + auto pairWiseOr = [&](std::vector &InList) -> std::vector { + std::vector OutList; + for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { + Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); + OutList.push_back(Or); + } + if (InList.size() % 2 != 0) + OutList.push_back(InList.back()); + return OutList; + }; + + // Pair wise OR the XOR results + OrList = pairWiseOr(XorList); + + // Pair wise OR the OR results until one result left + while (OrList.size() != 1) { + OrList = pairWiseOr(OrList); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0], + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock, otherwise continue to + // next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes) + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function creates the IR intructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does a subtract to see if there was +// a difference in the loaded values. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was +// larger. Otherwise, it falls through to the either the next LoadCmpBlock or +// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with +// a special case through emitLoadCompareByteBlock. The special handling can +// simply subtract the loaded values and add it to the result phi node. +void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, + int GEPIndex, bool IsLittleEndian) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + IRBuilder<> Builder(CI->getContext()); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (IsLittleEndian) { + Function *F = LoadCmpBlocks[Index]->getParent(); + + Function *Bswap = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality. + if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock, otherwise continue to + // next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes) + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) { + IRBuilder<> Builder(CI->getContext()); + + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +int MemCmpExpansion::calculateNumBlocks(unsigned Size) { + int NumBlocks = 0; + bool haveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + haveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + + if (IsUsedForZeroCmp) + NumBlocks = NumBlocks / NumLoadsPerBlock + + (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); + + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size, + bool IsLittleEndian) { + unsigned NumBytesProcessed = 0; + // This loop populates each of the LoadCmpBlocks with IR sequence to handle + // multiple loads per block + for (unsigned i = 0; i < NumBlocks; ++i) { + emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) { + + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + uint64_t Size = SizeCast->getZExtValue(); + + int LoadSize = MaxLoadSize; + int NumBytesToBeProcessed = Size; + + if (IsUsedForZeroCmp) { + return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + } + + unsigned Index = 0; + // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two + // memcmp source. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2. + while (NumBytesToBeProcessed) { + // Calculate how many blocks we can create with the current load size + int NumBlocks = NumBytesToBeProcessed / LoadSize; + int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + + // For each NumBlocks, populate the instruction sequence for loading and + // comparing LoadSize bytes + while (NumBlocks--) { + emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian); + Index++; + GEPIndex++; + } + // Get the next LoadSize to use + LoadSize = LoadSize / 2; + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced wtih new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 0 +/// br i1 %39, label %res_block, label %loadbb3 +/// loadbb3: ; preds = %loadbb2 +/// %40 = bitcast i32* %buffer2 to i8* +/// %41 = bitcast i32* %buffer1 to i8* +/// %42 = getelementptr i8, i8* %41, i8 14 +/// %43 = getelementptr i8, i8* %40, i8 14 +/// %44 = load i8, i8* %42 +/// %45 = load i8, i8* %43 +/// %46 = zext i8 %44 to i32 +/// %47 = zext i8 %45 to i32 +/// %48 = sub i32 %46, %47 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb3 +/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +/// ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + IRBuilder<> Builder(CI->getContext()); + + // TTI call to check if target would like to expand memcmp and get the + // MaxLoadSize + unsigned MaxLoadSize; + if (!TTI->expandMemCmp(CI, MaxLoadSize)) + return false; + + // Early exit from expansion if -Oz + if (CI->getParent()->getParent()->optForMinSize()) { + return false; + } + + // Early exit from expansion if size is not a constant + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + + // Early exit from expansion if size greater than max bytes to load + uint64_t SizeVal = SizeCast->getZExtValue(); + + unsigned NumLoads = 0; + unsigned RemainingSize = SizeVal; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + NumLoads += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + + if (NumLoads > + TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + // MemCmpHelper object, creates and sets up basic blocks required for + // expanding memcmp with size SizeVal + unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock; + MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock); + + Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian()); + + // Replace call with result of expansion and erarse call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1780,6 +2380,15 @@ CI->eraseFromParent(); return true; } + + LibFunc Func; + if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) && + Func == LibFunc_memcmp) { + if (expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } + } return false; } @@ -4927,6 +5536,7 @@ return true; } + namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp @@ -842,9 +842,10 @@ initActions(); // Perform these initializations only once. - MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxLoadsPerMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1041,6 +1041,10 @@ MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } Index: llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -85,20 +85,6 @@ return false; } -/// Return true if it only matters that the value is equal or not-equal to zero. -static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (User *U : V->users()) { - if (ICmpInst *IC = dyn_cast(U)) - if (IC->isEquality()) - if (Constant *C = dyn_cast(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - /// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { Index: llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,121 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} Index: llvm/trunk/test/CodeGen/PowerPC/memcmp.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/memcmp.ll +++ llvm/trunk/test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ldbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 Index: llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll +++ llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,194 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { +entry: + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) + ret i32 %call +} + +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) + ret i32 %call +} + +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) + ret i32 %call +} + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) + ret i32 %call +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) { + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) + ret i32 %call +}